diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/device_clock.cc | 73 | ||||
| -rw-r--r-- | src/device_clock.hh | 49 | ||||
| -rw-r--r-- | src/device_context.cc | 107 | ||||
| -rw-r--r-- | src/device_context.hh | 56 | ||||
| -rw-r--r-- | src/helper.cc | 5 | ||||
| -rw-r--r-- | src/helper.hh | 59 | ||||
| -rw-r--r-- | src/layer.cc | 141 | ||||
| -rw-r--r-- | src/physical_device_context.hh | 8 | ||||
| -rw-r--r-- | src/queue_context.cc | 352 | ||||
| -rw-r--r-- | src/queue_context.hh | 96 | ||||
| -rw-r--r-- | src/swapchain_monitor.cc | 112 | ||||
| -rw-r--r-- | src/swapchain_monitor.hh | 69 | ||||
| -rw-r--r-- | src/timestamp_pool.cc | 54 | ||||
| -rw-r--r-- | src/timestamp_pool.hh | 20 |
14 files changed, 575 insertions, 626 deletions
diff --git a/src/device_clock.cc b/src/device_clock.cc new file mode 100644 index 0000000..52c86d3 --- /dev/null +++ b/src/device_clock.cc @@ -0,0 +1,73 @@ +#include "device_clock.hh" +#include "device_context.hh" + +#include <vulkan/vulkan_core.h> + +#include <cassert> +#include <time.h> + +namespace low_latency { + +DeviceClock::DeviceClock(const DeviceContext& context) : device(context) { + this->calibrate(); +} + +DeviceClock::~DeviceClock() {} + +DeviceClock::time_point_t DeviceClock::now() { + auto ts = timespec{}; + if (clock_gettime(CLOCK_MONOTONIC, &ts)) { + throw errno; + } + + return time_point_t{std::chrono::seconds{ts.tv_sec} + + std::chrono::nanoseconds{ts.tv_nsec}}; +} + +void DeviceClock::calibrate() { + const auto infos = std::vector<VkCalibratedTimestampInfoKHR>{ + {VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, + VK_TIME_DOMAIN_DEVICE_EXT}, + {VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, + VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT}}; + + struct CalibratedResult { + std::uint64_t device; + std::uint64_t host; + }; + auto calibrated_result = CalibratedResult{}; + + THROW_NON_VKSUCCESS(device.vtable.GetCalibratedTimestampsKHR( + device.device, 2, std::data(infos), &calibrated_result.device, + &this->error_bound)); + + this->device_ticks = calibrated_result.device; + this->host_ns = calibrated_result.host; +} + +DeviceClock::time_point_t +DeviceClock::ticks_to_time(const std::uint64_t& ticks) const { + const auto& pd = device.physical_device.properties; + const auto ns_tick = static_cast<double>(pd->limits.timestampPeriod); + + const auto diff = [&]() -> auto { + auto a = this->device_ticks; + auto b = ticks; + const auto is_negative = a > b; + if (is_negative) { + std::swap(a, b); + } + const auto abs_diff = b - a; + assert(abs_diff <= std::numeric_limits<std::int64_t>::max()); + const auto signed_abs_diff = static_cast<std::int64_t>(abs_diff); + return is_negative ? 
-signed_abs_diff : signed_abs_diff; + }(); + + const auto diff_nsec = + static_cast<std::int64_t>(static_cast<double>(diff) * ns_tick + 0.5); + const auto delta = std::chrono::nanoseconds( + this->host_ns + static_cast<std::uint64_t>(diff_nsec)); + return time_point_t{delta}; +} + +} // namespace low_latency
\ No newline at end of file diff --git a/src/device_clock.hh b/src/device_clock.hh new file mode 100644 index 0000000..a52c59c --- /dev/null +++ b/src/device_clock.hh @@ -0,0 +1,49 @@ +#ifndef CLOCK_HH_ +#define CLOCK_HH_ + +#include <chrono> + +// This header provides a DeviceClock that abstracts away the Vulkan details of +// comparing CPU and GPU times. + +namespace low_latency { + +class DeviceContext; + +class DeviceClock final { + public: + // FIXME this is bad, see now(). + using time_point_t = std::chrono::time_point<std::chrono::steady_clock, + std::chrono::nanoseconds>; + const DeviceContext& device; + + public: + std::uint64_t host_ns; + std::uint64_t error_bound; + std::uint64_t device_ticks; + + public: + DeviceClock(const DeviceContext& device); + DeviceClock(const DeviceClock&) = delete; + DeviceClock(DeviceClock&&) = delete; + DeviceClock operator=(const DeviceClock&) = delete; + DeviceClock operator=(DeviceClock&&) = delete; + ~DeviceClock(); + + public: + // WARNING: This *MUST* be used over std::chrono::steady_clock::now if + // you're planning on comparing it to a device's clock. If it isn't, the + // timestamps might from different domains and will be completely + // nonsensical. + // FIXME we should be able to fix this with a tiny wrapper class of + // time_point_t that enforces typesafety. + static time_point_t now(); + + public: + void calibrate(); + time_point_t ticks_to_time(const std::uint64_t& ticks) const; +}; + +} // namespace low_latency + +#endif
\ No newline at end of file diff --git a/src/device_context.cc b/src/device_context.cc index 58737e2..5438e40 100644 --- a/src/device_context.cc +++ b/src/device_context.cc @@ -1,6 +1,5 @@ #include "device_context.hh" -#include <time.h> #include <utility> #include <vulkan/vulkan_core.h> @@ -9,15 +8,15 @@ namespace low_latency { DeviceContext::DeviceContext(InstanceContext& parent_instance, PhysicalDeviceContext& parent_physical_device, const VkDevice& device, - const bool was_antilag_requested, + const bool was_capability_requested, VkuDeviceDispatchTable&& vtable) : instance(parent_instance), physical_device(parent_physical_device), - was_antilag_requested(was_antilag_requested), device(device), + was_capability_requested(was_capability_requested), device(device), vtable(std::move(vtable)) { // Only create our clock if we can support creating it. if (this->physical_device.supports_required_extensions) { - this->clock = std::make_unique<Clock>(*this); + this->clock = std::make_unique<DeviceClock>(*this); } } @@ -29,72 +28,10 @@ DeviceContext::~DeviceContext() { } } -DeviceContext::Clock::Clock(const DeviceContext& context) : device(context) { - this->calibrate(); -} - -DeviceContext::Clock::~Clock() {} - -DeviceContext::Clock::time_point_t DeviceContext::Clock::now() { - auto ts = timespec{}; - if (clock_gettime(CLOCK_MONOTONIC, &ts)) { - throw errno; - } - - return time_point_t{std::chrono::seconds{ts.tv_sec} + - std::chrono::nanoseconds{ts.tv_nsec}}; -} - -void DeviceContext::Clock::calibrate() { - const auto infos = std::vector<VkCalibratedTimestampInfoKHR>{ - {VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, - VK_TIME_DOMAIN_DEVICE_EXT}, - {VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, - VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT}}; - - struct CalibratedResult { - std::uint64_t device; - std::uint64_t host; - }; - auto calibrated_result = CalibratedResult{}; - - THROW_NON_VKSUCCESS(device.vtable.GetCalibratedTimestampsKHR( - device.device, 2, 
std::data(infos), &calibrated_result.device, - &this->error_bound)); - - this->device_ticks = calibrated_result.device; - this->host_ns = calibrated_result.host; -} - -DeviceContext::Clock::time_point_t -DeviceContext::Clock::ticks_to_time(const std::uint64_t& ticks) const { - const auto& pd = device.physical_device.properties; - const auto ns_tick = static_cast<double>(pd->limits.timestampPeriod); - - const auto diff = [&]() -> auto { - auto a = this->device_ticks; - auto b = ticks; - const auto is_negative = a > b; - if (is_negative) { - std::swap(a, b); - } - const auto abs_diff = b - a; - assert(abs_diff <= std::numeric_limits<std::int64_t>::max()); - const auto signed_abs_diff = static_cast<std::int64_t>(abs_diff); - return is_negative ? -signed_abs_diff : signed_abs_diff; - }(); - - const auto diff_nsec = - static_cast<std::int64_t>(static_cast<double>(diff) * ns_tick + 0.5); - const auto delta = std::chrono::nanoseconds( - this->host_ns + static_cast<std::uint64_t>(diff_nsec)); - return time_point_t{delta}; -} - +/* void DeviceContext::sleep_in_input() { // TODO - /* // Present hasn't happened yet, we don't know what queue to attack. if (!this->present_queue) { return; @@ -122,30 +59,36 @@ void DeviceContext::sleep_in_input() { // would get huge frame drops, loss of throughput, and the GPU would even // clock down. So naturally I am concerned about this approach, but it seems // to perform well so far in my own testing and is just beautifully elegant. - */ } +*/ -void DeviceContext::update_swapchain_infos( +void DeviceContext::update_params( const std::optional<VkSwapchainKHR> target, const std::chrono::milliseconds& present_delay, const bool was_low_latency_requested) { - const auto write = SwapchainInfo{ - .present_delay = present_delay, - .was_low_latency_requested = was_low_latency_requested, - }; - - if (target.has_value()) { - const auto iter = this->swapchain_infos.find(*target); - assert(iter != std::end(this->swapchain_infos)); // Must exist (spec). 
- iter->second = write; + // If we don't have a target (AMD's anti_lag doesn't differentiate between + // swapchains), just write it to everything. + if (!target.has_value()) { + for (auto& iter : this->swapchain_monitors) { + iter.second.update_params(was_low_latency_requested, present_delay); + } return; } - // If we don't have a target (AMD's anti_lag), just write it to everything. - for (auto& iter : this->swapchain_infos) { - iter.second = write; - } + const auto iter = this->swapchain_monitors.find(*target); + assert(iter != std::end(this->swapchain_monitors)); + iter->second.update_params(was_low_latency_requested, present_delay); +} + +void DeviceContext::notify_present( + const VkSwapchainKHR& swapchain, + const QueueContext::submissions_t& submissions) { + + const auto iter = this->swapchain_monitors.find(swapchain); + assert(iter != std::end(this->swapchain_monitors)); + + iter->second.notify_present(submissions); } } // namespace low_latency
\ No newline at end of file diff --git a/src/device_context.hh b/src/device_context.hh index 6b5f000..172801c 100644 --- a/src/device_context.hh +++ b/src/device_context.hh @@ -11,59 +11,30 @@ #include <vulkan/vulkan_core.h> #include "context.hh" +#include "device_clock.hh" #include "instance_context.hh" #include "physical_device_context.hh" +#include "queue_context.hh" +#include "swapchain_monitor.hh" namespace low_latency { -class QueueContext; - -struct DeviceContext final : public Context { +class DeviceContext final : public Context { public: InstanceContext& instance; PhysicalDeviceContext& physical_device; - const bool was_antilag_requested; + // Whether or not we were asked to do NV_VK_LowLatency2 or VK_AMD_anti_lag. + const bool was_capability_requested; const VkDevice device; const VkuDeviceDispatchTable vtable; - // Tiny struct to represent any swapchain's low latency state. - struct SwapchainInfo { - std::chrono::milliseconds present_delay = std::chrono::milliseconds{0}; - bool was_low_latency_requested = false; - }; - std::unordered_map<VkSwapchainKHR, SwapchainInfo> swapchain_infos{}; - std::unordered_map<VkQueue, std::shared_ptr<QueueContext>> queues; - struct Clock { - public: - using time_point_t = std::chrono::time_point<std::chrono::steady_clock, - std::chrono::nanoseconds>; - const DeviceContext& device; - - public: - std::uint64_t host_ns; - std::uint64_t error_bound; - std::uint64_t device_ticks; - - public: - Clock(const DeviceContext& device); - ~Clock(); + std::unique_ptr<DeviceClock> clock; - public: - // WARNING: This *MUST* be used over std::chrono::steady_clock::now if - // you're planning on comparing it to a device's clock. If it isn't, the - // timestamps might from different domains and will be completely - // nonsensical. 
- static time_point_t now(); - - public: - void calibrate(); - time_point_t ticks_to_time(const std::uint64_t& ticks) const; - }; - std::unique_ptr<Clock> clock; + std::unordered_map<VkSwapchainKHR, SwapchainMonitor> swapchain_monitors; public: DeviceContext(InstanceContext& parent_instance, @@ -73,13 +44,14 @@ struct DeviceContext final : public Context { virtual ~DeviceContext(); public: - void sleep_in_input(); - // Updates the settings associated with that swapchain. If none is provided // all swapchains are set to this value. - void update_swapchain_infos(const std::optional<VkSwapchainKHR> target, - const std::chrono::milliseconds& present_delay, - const bool was_low_latency_requested); + void update_params(const std::optional<VkSwapchainKHR> target, + const std::chrono::milliseconds& present_delay, + const bool was_low_latency_requested); + + void notify_present(const VkSwapchainKHR& swapchain, + const QueueContext::submissions_t& submissions); }; }; // namespace low_latency diff --git a/src/helper.cc b/src/helper.cc new file mode 100644 index 0000000..bb17c59 --- /dev/null +++ b/src/helper.cc @@ -0,0 +1,5 @@ +#include "helper.hh" + +namespace low_latency { + +}
\ No newline at end of file diff --git a/src/helper.hh b/src/helper.hh new file mode 100644 index 0000000..468f146 --- /dev/null +++ b/src/helper.hh @@ -0,0 +1,59 @@ +#ifndef HELPER_HH_ +#define HELPER_HH_ + +#include <vulkan/vk_layer.h> +#include <vulkan/vulkan.h> + +#include <cstdint> + +namespace low_latency { + +// Small templates which allow us to SFINAE find pNext structs. +template <typename T> +static T* find_next(void* const head, const VkStructureType& stype) { + for (auto i = reinterpret_cast<VkBaseOutStructure*>(head)->pNext; i; + i = i->pNext) { + + if (i->sType == stype) { + return reinterpret_cast<T*>(i); + } + } + return nullptr; +} + +template <typename T> +static const T* find_next(const void* const head, + const VkStructureType& stype) { + + for (auto i = reinterpret_cast<const VkBaseInStructure*>(head)->pNext; i; + i = i->pNext) { + + if (i->sType == stype) { + return reinterpret_cast<const T*>(i); + } + } + return nullptr; +} + +template <typename T> +static const T* find_link(const void* const head, + const VkStructureType& stype) { + for (auto info = find_next<T>(head, stype); info; + info = find_next<T>(info, stype)) { + + if (info->function == VK_LAYER_LINK_INFO) { + return reinterpret_cast<const T*>(info); + } + } + return nullptr; +} + +template <typename T> std::uint64_t extract_present_id(const T& submit) { + const auto lspi = find_next<VkLatencySubmissionPresentIdNV>( + &submit, VK_STRUCTURE_TYPE_LATENCY_SUBMISSION_PRESENT_ID_NV); + return lspi ? lspi->presentID : 0; +} + +} // namespace low_latency + +#endif
\ No newline at end of file diff --git a/src/layer.cc b/src/layer.cc index 5460fca..7a7ffc8 100644 --- a/src/layer.cc +++ b/src/layer.cc @@ -14,7 +14,9 @@ #include <vulkan/vulkan.hpp> #include <vulkan/vulkan_core.h> +#include "device_clock.hh" #include "device_context.hh" +#include "helper.hh" #include "instance_context.hh" #include "layer_context.hh" #include "queue_context.hh" @@ -28,46 +30,6 @@ LayerContext layer_context; } // namespace -// Small templates which allow us to SFINAE find pNext structs. -template <typename T> -static T* find_next(void* const head, const VkStructureType& stype) { - for (auto i = reinterpret_cast<VkBaseOutStructure*>(head)->pNext; i; - i = i->pNext) { - - if (i->sType == stype) { - return reinterpret_cast<T*>(i); - } - } - return nullptr; -} - -template <typename T> -static const T* find_next(const void* const head, - const VkStructureType& stype) { - - for (auto i = reinterpret_cast<const VkBaseInStructure*>(head)->pNext; i; - i = i->pNext) { - - if (i->sType == stype) { - return reinterpret_cast<const T*>(i); - } - } - return nullptr; -} - -template <typename T> -static const T* find_link(const void* const head, - const VkStructureType& stype) { - for (auto info = find_next<T>(head, stype); info; - info = find_next<T>(info, stype)) { - - if (info->function == VK_LAYER_LINK_INFO) { - return reinterpret_cast<const T*>(info); - } - } - return nullptr; -} - static VKAPI_ATTR VkResult VKAPI_CALL CreateInstance(const VkInstanceCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkInstance* pInstance) { @@ -209,12 +171,12 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice( // is not the case with AL2, because the vulkan application has to // explicitly ask for the extension when it creates the device. 
- const auto was_antilag_requested = + const auto was_capability_requested = requested.contains(VK_AMD_ANTI_LAG_EXTENSION_NAME) || requested.contains(VK_NV_LOW_LATENCY_2_EXTENSION_NAME); const auto context = layer_context.get_context(physical_device); - if (!context->supports_required_extensions && was_antilag_requested) { + if (!context->supports_required_extensions && was_capability_requested) { return VK_ERROR_INITIALIZATION_FAILED; } @@ -305,7 +267,7 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice( assert(!layer_context.contexts.contains(key)); layer_context.contexts.try_emplace( key, std::make_shared<DeviceContext>(context->instance, *context, - *pDevice, was_antilag_requested, + *pDevice, was_capability_requested, std::move(vtable))); return VK_SUCCESS; @@ -443,7 +405,7 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count, // more explicit + insurance if that changes. auto handles = std::vector<std::shared_ptr<TimestampPool::Handle>>{}; - const auto now = DeviceContext::Clock::now(); + const auto now = DeviceClock::now(); std::ranges::transform( std::span{submit_infos, submit_count}, std::back_inserter(next_submits), @@ -451,7 +413,9 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count, const auto head_handle = context->timestamp_pool->acquire(); const auto tail_handle = context->timestamp_pool->acquire(); head_handle->setup_command_buffers(*tail_handle, *context); - context->notify_submit(submit, head_handle, tail_handle, now); + + context->notify_submit(extract_present_id(submit), head_handle, + tail_handle, now); handles.emplace_back(head_handle); handles.emplace_back(tail_handle); @@ -494,7 +458,7 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count, auto next_cbs = std::vector<std::unique_ptr<cbs_t>>{}; auto handles = std::vector<std::shared_ptr<TimestampPool::Handle>>{}; - const auto now = DeviceContext::Clock::now(); + const auto now = DeviceClock::now(); std::ranges::transform( std::span{submit_infos, submit_count}, 
std::back_inserter(next_submits), @@ -502,7 +466,9 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count, const auto head_handle = context->timestamp_pool->acquire(); const auto tail_handle = context->timestamp_pool->acquire(); head_handle->setup_command_buffers(*tail_handle, *context); - context->notify_submit(submit, head_handle, tail_handle, now); + + context->notify_submit(extract_present_id(submit), head_handle, + tail_handle, now); handles.emplace_back(head_handle); handles.emplace_back(tail_handle); @@ -553,7 +519,14 @@ vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* present_info) { return res; } - context->notify_present(*present_info); + const auto pid = find_next<VkPresentIdKHR>( + present_info, VK_STRUCTURE_TYPE_PRESENT_ID_KHR); + + for (auto i = std::uint32_t{0}; i < present_info->swapchainCount; ++i) { + const auto& swapchain = present_info->pSwapchains[i]; + const auto present_id = pid ? pid->pPresentIds[i] : 0; + context->notify_present(swapchain, present_id); + } return VK_SUCCESS; } @@ -644,6 +617,17 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2( vtable.GetPhysicalDeviceFeatures2(physical_device, pFeatures); + // We're going to use this feature for both VK_AMD_anti_lag and + // VK_NV_low_latency2. It simplifies things a bit if we share a code path + // for now. TODO remove it in the future for VK_AMD_anti_lag. + if (const auto pidf = find_next<VkPhysicalDevicePresentIdFeaturesKHR>( + pFeatures, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENT_ID_FEATURES_KHR); + pidf) { + + pidf->presentId = true; + } + // Don't provide AntiLag if we're trying to spoof nvidia. // Nvidia uses VkSurfaceCapabilities2KHR to determine if a surface // is capable of reflex instead of AMD's physical device switch found here. 
@@ -651,11 +635,11 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2( return; } - const auto feature = find_next<VkPhysicalDeviceAntiLagFeaturesAMD>( - pFeatures, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ANTI_LAG_FEATURES_AMD); + if (const auto alf = find_next<VkPhysicalDeviceAntiLagFeaturesAMD>( + pFeatures, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ANTI_LAG_FEATURES_AMD); + alf) { - if (feature) { - feature->antiLag = context->supports_required_extensions; + alf->antiLag = context->supports_required_extensions; } } @@ -707,12 +691,11 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceSurfaceCapabilities2KHR( const auto lsc = find_next<VkLatencySurfaceCapabilitiesNV>( pSurfaceCapabilities, VK_STRUCTURE_TYPE_LATENCY_SURFACE_CAPABILITIES_NV); - if (!lsc) { return; } - // I kind of eyeballed these! + // I eyeballed these - there might be more that we can support. const auto supported_modes = std::vector<VkPresentModeKHR>{ VK_PRESENT_MODE_IMMEDIATE_KHR, VK_PRESENT_MODE_MAILBOX_KHR, @@ -723,7 +706,7 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceSurfaceCapabilities2KHR( // They're asking how many we want to return. if (!lsc->pPresentModes) { - lsc->presentModeCount = static_cast<std::uint32_t>(num_supported_modes); + lsc->presentModeCount = num_supported_modes; return; } @@ -750,19 +733,17 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateSwapchainKHR( return result; } - auto addition = DeviceContext::SwapchainInfo{ - .present_delay = std::chrono::milliseconds{0}, - .was_low_latency_requested = false, - }; - + // VK_NV_low_latency2 allows a swapchain to be created with the low latency + // mode already on via VkSwapchainLatencyCreateInfoNV. 
+ auto was_low_latency_requested = false; if (const auto slci = find_next<VkSwapchainLatencyCreateInfoNV>( pCreateInfo, VK_STRUCTURE_TYPE_SWAPCHAIN_LATENCY_CREATE_INFO_NV); slci) { - - addition.was_low_latency_requested = slci->latencyModeEnable; - } - assert(context->swapchain_infos.try_emplace(*pSwapchain, addition).second); + was_low_latency_requested = slci->latencyModeEnable; + } + context->swapchain_monitors.try_emplace(*pSwapchain, *context, + was_low_latency_requested); return VK_SUCCESS; } @@ -772,7 +753,7 @@ DestroySwapchainKHR(VkDevice device, VkSwapchainKHR swapchain, const VkAllocationCallbacks* pAllocator) { const auto context = layer_context.get_context(device); - assert(context->swapchain_infos.erase(swapchain)); + assert(context->swapchain_monitors.erase(swapchain)); context->vtable.DestroySwapchainKHR(device, swapchain, pAllocator); } @@ -788,20 +769,20 @@ AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD* pData) { // NVIDIA's method and then have a working AL2 implementation follow using // that existing code path. - const auto present_delay = [&]() { // lambda abuse? + const auto present_delay = [&]() { using namespace std::chrono; return duration_cast<milliseconds>(1s / pData->maxFPS); }(); - context->update_swapchain_infos(std::nullopt, present_delay, - (pData->mode == VK_ANTI_LAG_MODE_ON_AMD)); + context->update_params(std::nullopt, present_delay, + (pData->mode == VK_ANTI_LAG_MODE_ON_AMD)); if (!pData->pPresentationInfo) { return; } if (pData->pPresentationInfo->stage == VK_ANTI_LAG_STAGE_INPUT_AMD) { - context->sleep_in_input(); + // TODO use nvidia's path } } @@ -811,16 +792,25 @@ VkResult LatencySleepNV(VkDevice device, VkSwapchainKHR swapchain, const auto context = layer_context.get_context(device); assert(pSleepInfo); - // TODO sleep here + // We're associating an application-provided timeline semaphore + value with + // a swapchain that says 'signal me when we should move past input'. 
+ auto& swapchain_monitor = [&]() -> auto& { + const auto iter = context->swapchain_monitors.find(swapchain); + assert(iter != std::end(context->swapchain_monitors)); + return iter->second; + }(); + + // Tell our swapchain monitor that if they want us to proceed they should + // signal this semaphore. + swapchain_monitor.notify_semaphore(pSleepInfo->signalSemaphore, + pSleepInfo->value); return VK_SUCCESS; } void QueueNotifyOutOfBandNV(VkQueue queue, const VkOutOfBandQueueTypeInfoNV* pQueueTypeInfo) { - // This is really thoughtful from NVIDIA. Having the application explicitly - // state which queues should be ignored for latency evaluation is far - // superior to AMD's guessing game. + // Kind of interesting how you can't turn it back on once it's turned off. // Also I really have no idea why pQueueTypeInfo's VkOutOfBandQueueTypeNV // enum even exists (I guess we will find out later when nothing works). @@ -834,14 +824,13 @@ VkResult SetLatencySleepModeNV(VkDevice device, VkSwapchainKHR swapchain, const auto context = layer_context.get_context(device); if (pSleepModeInfo) { - context->update_swapchain_infos( + context->update_params( swapchain, std::chrono::milliseconds{pSleepModeInfo->minimumIntervalUs}, pSleepModeInfo->lowLatencyMode); } else { // If pSleepModeInfo is nullptr, it means no delay and no low latency. 
- context->update_swapchain_infos(swapchain, std::chrono::milliseconds{0}, - false); + context->update_params(swapchain, std::chrono::milliseconds{0}, false); } return VK_SUCCESS; } diff --git a/src/physical_device_context.hh b/src/physical_device_context.hh index 9624faa..f7ad289 100644 --- a/src/physical_device_context.hh +++ b/src/physical_device_context.hh @@ -4,6 +4,7 @@ #include "instance_context.hh" #include <vulkan/vulkan.hpp> +#include <vulkan/vulkan_core.h> #include "context.hh" @@ -17,7 +18,8 @@ class PhysicalDeviceContext final : public Context { static constexpr auto required_extensions = { VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME, VK_KHR_CALIBRATED_TIMESTAMPS_EXTENSION_NAME, - VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME}; + VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME, + VK_KHR_PRESENT_ID_EXTENSION_NAME}; public: InstanceContext& instance; @@ -29,8 +31,8 @@ class PhysicalDeviceContext final : public Context { using queue_properties_t = std::vector<VkQueueFamilyProperties2>; std::unique_ptr<const queue_properties_t> queue_properties; - // Will be set to true in the constructor if the physical device supports - // everything we need to track gpu timing data. + // Will be true if the physical device supports everything in + // this->required_extensions. 
bool supports_required_extensions = false; public: diff --git a/src/queue_context.cc b/src/queue_context.cc index d12f03d..30e73c1 100644 --- a/src/queue_context.cc +++ b/src/queue_context.cc @@ -3,11 +3,6 @@ #include "layer_context.hh" #include "timestamp_pool.hh" -#include <algorithm> -#include <chrono> -#include <fstream> -#include <iostream> -#include <ranges> #include <span> #include <vulkan/vulkan_core.h> @@ -49,333 +44,52 @@ QueueContext::QueueContext(DeviceContext& device_context, const VkQueue& queue, } QueueContext::~QueueContext() { - this->in_flight_frames.clear(); - this->submissions.clear(); + this->unpresented_submissions.clear(); this->timestamp_pool.reset(); } void QueueContext::notify_submit( - const VkSubmitInfo& info, + const present_id_t& present_id, const std::shared_ptr<TimestampPool::Handle> head_handle, const std::shared_ptr<TimestampPool::Handle> tail_handle, - const DeviceContext::Clock::time_point_t& now) { - - auto signals = std::unordered_set<VkSemaphore>{}; - auto waits = std::unordered_set<VkSemaphore>{}; - std::ranges::copy(std::span{info.pWaitSemaphores, info.waitSemaphoreCount}, - std::inserter(waits, std::end(waits))); - std::ranges::copy( - std::span{info.pSignalSemaphores, info.signalSemaphoreCount}, - std::inserter(signals, std::end(signals))); - - this->submissions.emplace_back(std::make_unique<Submission>( - std::move(signals), std::move(waits), head_handle, tail_handle, now)); - - if (std::size(this->submissions) > this->MAX_TRACKED_SUBMISSIONS) { - this->submissions.pop_front(); - } -} - -// Identical to notify_submit, but we use VkSubmitInfo2. 
-void QueueContext::notify_submit( - const VkSubmitInfo2& info, - const std::shared_ptr<TimestampPool::Handle> head_handle, - const std::shared_ptr<TimestampPool::Handle> tail_handle, - const DeviceContext::Clock::time_point_t& now) { - - auto signals = std::unordered_set<VkSemaphore>{}; - auto waits = std::unordered_set<VkSemaphore>{}; - - std::ranges::transform( - std::span{info.pWaitSemaphoreInfos, info.waitSemaphoreInfoCount}, - std::inserter(waits, std::end(waits)), - [](const auto& info) -> auto { return info.semaphore; }); - - std::ranges::transform( - std::span{info.pSignalSemaphoreInfos, info.signalSemaphoreInfoCount}, - std::inserter(signals, std::end(signals)), - [](const auto& info) -> auto { return info.semaphore; }); - - this->submissions.emplace_back(std::make_unique<Submission>( - std::move(signals), std::move(waits), head_handle, tail_handle, now)); - - if (std::size(this->submissions) > this->MAX_TRACKED_SUBMISSIONS) { - this->submissions.pop_front(); - } -} - -void QueueContext::drain_submissions_to_frame() { - - // We are going to assume that all queue submissions before and on the same - // queue contribute to the frame. - - // This used to be more complicated where we found the first submission that - // was signalled by acquire, then we walked forwards until we found the - // submission before it that marked the end of frame (which was the last - // submission in the previous frame that called notify submit). This seemed - // completely redundant, in all cases it was exactly what we have here. But - // I could be wrong. - - const auto start_iter = std::begin(this->submissions); - // no op submit? - if (start_iter == std::end(this->submissions)) { - return; + const DeviceClock::time_point_t& now) { + + // Push this submission onto our unpresented_submissions at our present_id + // mapping (might be empty, but handled with operator[]). 
+ auto& submissions = this->unpresented_submissions[present_id]; + if (submissions == nullptr) { + submissions = + std::make_shared<std::deque<std::unique_ptr<Submission>>>(); } - // The last submission is either in flight, already processed, or we - // just happen to be the first frame and we can just set it to our start - // with little consequence. - const auto prev_frame_last_submit = [&]() -> auto { - if (const auto iter = std::rbegin(this->in_flight_frames); - iter != std::rend(this->in_flight_frames)) { - - assert(!iter->submissions.empty()); - return iter->submissions.back(); - } - - if (const auto iter = std::rbegin(this->timings); - iter != std::rend(this->timings)) { + submissions->push_back( + std::make_unique<Submission>(Submission{.head_handle = head_handle, + .tail_handle = tail_handle, + .cpu_present_time = now})); - const auto& submissions = (*iter)->frame.submissions; - assert(!submissions.empty()); - - return submissions.back(); - } - - return *start_iter; - }(); - - this->in_flight_frames.emplace_back( - Frame{.submissions = std::move(this->submissions), - .cpu_post_present_time = DeviceContext::Clock::now()}); - assert(std::size(this->in_flight_frames.back().submissions)); - // *valid but unspecified state after move, so clear!* - this->submissions.clear(); -} - -void QueueContext::notify_present(const VkPresentInfoKHR& info) { - this->drain_submissions_to_frame(); - this->drain_frames_to_timings(); - - // We should only sleep in present if two conditions are met: - // 1. Our antilag_mode isn't set to on, because otherwise the sleep will - // be done in input and with far better results. - // 2. The 'is_antilag_1_enabled' flag, which exists at the layer's - // context, is set. 
- // - /* - * WIP REFLEX - if (this->device_context.antilag_mode != VK_ANTI_LAG_MODE_ON_AMD && - this->device_context.instance.layer.is_antilag_1_enabled) { - - this->sleep_in_present(); + // This is probably hit if our queue never actually presents to anything, + // because the only time we manually evict our unpresented_submissions is + // when we present to something. + if (std::size(*submissions) > this->MAX_TRACKED_SUBMISSIONS) { + submissions->pop_front(); } - */ } -const auto debug_log_time2 = [](auto& stream, const auto& diff) { - using namespace std::chrono; - const auto ms = duration_cast<milliseconds>(diff); - const auto us = duration_cast<microseconds>(diff - ms); - const auto ns = duration_cast<nanoseconds>(diff - ms - us); - stream << ms << " " << us << " " << ns << " ago\n"; -}; +void QueueContext::notify_present(const VkSwapchainKHR& swapchain, + const present_id_t& present_id) { -void QueueContext::drain_frames_to_timings() { - if (!std::size(this->in_flight_frames)) { - return; + // Notify the device that this swapchain was just presented to. + // We're avoiding a double hash here - don't use operator[] and erase. + auto iter = this->unpresented_submissions.try_emplace(present_id).first; + if (iter->second == nullptr) { + iter->second = + std::make_shared<std::deque<std::unique_ptr<Submission>>>(); } - // Only need to calibrate this device, we don't support multi device anti - // lag. - this->device_context.clock->calibrate(); - - while (std::size(this->in_flight_frames)) { - const auto& frame = this->in_flight_frames.front(); - - assert(std::size(frame.submissions)); - - const auto& last_submission = frame.submissions.back(); - - // Not completed (so future frames definitely aren't) - stop early. - if (!last_submission->end_handle->get_time().has_value()) { - break; - } - - // We are committed to removing the frame at this stage and - // promoting it to a 'timing' struct because it's completed. 
- // We can guarantee that we can extract timing information from - // all start/end handles now. - - // Using leetcode merge intervals in the wild lol - struct Interval { - DeviceContext::Clock::time_point_t start, end; - }; - - const auto sorted_intervals = [&]() -> auto { - auto intervals = std::vector<Interval>{}; - std::ranges::transform( - frame.submissions, std::back_inserter(intervals), - [&](const auto& submission) { - return Interval{ - .start = submission->start_handle->get_time_required(), - .end = submission->end_handle->get_time_required(), - }; - }); - - std::ranges::sort(intervals, [](const auto& a, const auto& b) { - return a.start < b.start; - }); - return intervals; - }(); - - const auto merged = [&]() -> auto { - auto merged = std::vector<Interval>{}; - auto last = sorted_intervals[0]; - - for (const auto& [s, e] : sorted_intervals | std::views::drop(1)) { - if (s <= last.end) { - last.end = std::max(last.end, e); - } else { - merged.push_back(last); - last = {s, e}; - } - } - merged.push_back(last); - return merged; - }(); - - // It's important to note that gputime starts from a point which isn't - // equal to the below 'start' var. It looks something like this, where a - // '-' represents CPU time only and '=' represents CPU + GPU. - // - // |---------------------|=========|--------|====|-----------------| - // ^ last_present ^ merged.front().start present ^ - // merged.back().end ^ - // - // I would imagine there would be more GPU than cpu to reach the anti - // lag codepath than is depicted here. We can track the total time - // between vkPresent calls as future_submit - last_submit. The total - // time the GPU spent engaged is the sum of all intervals. So we can - // get a meaningful 'not_gputime' as total - gpu_time. 
- - const auto gputime = std::ranges::fold_left( - merged, DeviceContext::Clock::time_point_t::duration{}, - [](auto gputime, const auto& interval) { - const auto& [start, end] = interval; - return gputime + (end - start); - }); - - // Our cpu_start value here refers to the time when the CPU was allowed - // to move past the present call and, in theory, begin cpu work on the - // next frame. - const auto cpu_start = [&]() -> auto { - if (const auto it = std::rbegin(this->timings); - it != std::rend(this->timings)) { - - return (*it)->frame.cpu_post_present_time; - } - // This will happen once, only for the first frame. We don't - // have a way of knowing when the CPU first started work here. - // Just return our first submit's start for this edge case. - return frame.submissions.front()->start_handle->get_time_required(); - }(); - - const auto cputime = - frame.submissions.front()->enqueued_time - cpu_start; - - this->timings.emplace_back(std::make_unique<Timing>(Timing{ - .gputime = gputime, - .cputime = cputime, - .frame = frame, - })); - - this->in_flight_frames.pop_front(); - } - - if (const auto T = std::size(this->timings); - T > this->MAX_TRACKED_TIMINGS) { - - const auto erase_to_iter = - std::next(std::begin(this->timings), - static_cast<long>(T - MAX_TRACKED_TIMINGS)); - this->timings.erase(std::begin(this->timings), erase_to_iter); - } -} - -void QueueContext::sleep_in_present() { - // After calling this, any remaining frames are truly in flight. - this->drain_frames_to_timings(); - if (!std::size(this->in_flight_frames)) { - return; - } - - // This is getting the most recent frame and waiting until its start has - // begun. This means that, in the case of >1 frame in flight, it's draining - // all of them before we're allowed to move forward. 
- const auto first_gpu_work = [&]() -> auto { - const auto& most_recent_frame = this->in_flight_frames.back(); - const auto& first_submission = most_recent_frame.submissions.front(); - return first_submission->start_handle->get_time_spinlock(); - }(); - - // Drain frames again because as stated above, we might have multiple frames - // now completed after our wait spinlock. - this->drain_frames_to_timings(); - - // Check the size again because the frame we want to target may have already - // completed when we called process_frames(). - if (!std::size(this->in_flight_frames)) { - return; - } - assert(std::size(this->in_flight_frames) == 1); - - // Not enough data yet to apply any delays. - if (std::size(this->timings) < this->MAX_TRACKED_TIMINGS) { - return; - } - - const auto calc_median = [&, this](const auto& getter) { - auto vect = std::vector<Timing*>{}; - std::ranges::transform(this->timings, std::back_inserter(vect), - [](const auto& timing) { return timing.get(); }); - std::ranges::sort(vect, [&](const auto& a, const auto& b) { - return getter(a) < getter(b); - }); - return getter(vect[std::size(vect) / 2]); - }; - - const auto expected_gputime = - calc_median([](const auto& timing) { return timing->gputime; }); - const auto expected_cputime = - calc_median([](const auto& timing) { return timing->cputime; }); - - // Should look like this: - // total_length = expected_gputime - // |------------------------x------------------------------| - // ^ first_gpu_work now last_gpu_work ^ - - const auto now = DeviceContext::Clock::now(); - const auto dist = now - first_gpu_work; - const auto expected_dist_to_last = expected_gputime - dist; - - const auto wait_time = expected_dist_to_last - expected_cputime; - - auto& frame = this->in_flight_frames.back(); - const auto& last_gpu_work = frame.submissions.back()->end_handle; - last_gpu_work->get_time_spinlock(now + wait_time); - - frame.cpu_post_present_time = std::chrono::steady_clock::now(); + 
this->device_context.notify_present(swapchain, iter->second); - std::ofstream f("/tmp/times.txt", std::ios::trunc); - f << " expected gputime: "; - debug_log_time2(f, expected_gputime); - f << " expected cputime: "; - debug_log_time2(f, expected_cputime); - f << " requestd sleep: "; - debug_log_time2(f, wait_time); - f << " observed sleep: "; - debug_log_time2(f, frame.cpu_post_present_time - now); + // Important, we nuke the submission because now it's presented. + this->unpresented_submissions.erase(iter); } bool QueueContext::should_inject_timestamps() const { @@ -385,9 +99,9 @@ bool QueueContext::should_inject_timestamps() const { return false; } - // Don't bother injecting timestamps during queue submission if both AL1 and - // AL2 are disabled. - if (!this->device_context.was_antilag_requested && + // Don't bother injecting timestamps during queue submission if we + // aren't planning on doing anything anyway. + if (!this->device_context.was_capability_requested && !physical_device.instance.layer.is_antilag_1_enabled) { return false; diff --git a/src/queue_context.hh b/src/queue_context.hh index 221626f..48500e1 100644 --- a/src/queue_context.hh +++ b/src/queue_context.hh @@ -2,33 +2,23 @@ #define QUEUE_STATE_HH_ #include "context.hh" -#include "device_context.hh" +#include "device_clock.hh" #include "timestamp_pool.hh" #include <vulkan/utility/vk_dispatch_table.h> #include <vulkan/vulkan.hpp> -#include <chrono> #include <deque> #include <memory> -#include <unordered_set> +#include <unordered_map> namespace low_latency { class QueueContext final : public Context { private: - // The amount of finished frame timing data we keep before eviction. - // For now, this value is also the number of data points used in the - // calculation of gpu timing information. - static constexpr auto MAX_TRACKED_TIMINGS = 50u; // The amount of queue submissions we allow tracked per queue before - // we give up tracking them. 
For a queue that is presented to, - // these submissions will be constantly moved to Frame structs so - // it's not an issue that we only track so many - unless it just - // happens that an application makes an unexpectedly large - // amount of vkQueueSubmit's per frame. For queues which don't - // present, this limit stops them from growing limitlessly in memory - // as we may not necessarily manually evict them yet. + // we give up tracking them. This is necessary for queues which do not + // present anything. static constexpr auto MAX_TRACKED_SUBMISSIONS = 50u; public: @@ -59,55 +49,35 @@ class QueueContext final : public Context { // NVIDIA's extension lets the application explicitly state that this queue // does not contribute to the frame. AMD's extension has no such mechanism - - // so this will always be false. + // so this will always be false when using VK_AMD_anti_lag. bool should_ignore_latency = false; public: - // Potentially in flight queue submissions that come from this queue. + // I want our queue bookkeeping to be fairly simple and do one thing - track + // submissions that have yet to be presented to a swapchain. General + // idea: + // + // For each vkQueueSubmit (specifically for each pSubmitInfo in that + // hook) grab the VK_KHR_present_id value provided by the application for + // that submission. Once we add our timing objects as part of the hook, we + // then take those timing objects, bundle them into a Submission struct, and + // append it to the (potentially currently nonexistent) mapping of + // present_id's to deque<Submission>'s. Now we cleanly track what queue + // submissions refer to what present_id. + // + // When our hook sees a vkQueuePresentKHR, we take the provided present_id + // and notify our device that it needs to watch for when this completes. + // We give it our submission. Now, it's out of our hands. We remove the + // present_id_t mapping when doing so. 
struct Submission { - const std::unordered_set<VkSemaphore> signals; - const std::unordered_set<VkSemaphore> waits; - - const std::shared_ptr<TimestampPool::Handle> start_handle; - const std::shared_ptr<TimestampPool::Handle> end_handle; - - const DeviceContext::Clock::time_point_t enqueued_time; - }; - using submission_ptr_t = std::shared_ptr<Submission>; - std::deque<submission_ptr_t> submissions; - - // In flight frame submissions grouped together. - // The first element in the deque refers to the first submission that - // contributed to that frame. The last element is the last submission before - // present was called. - // std::size(submissions) >= 1 btw - struct Frame { - std::deque<submission_ptr_t> submissions; - - // the point that control flow was returned from VkQueuePresentKHR back - // to the application. - DeviceContext::Clock::time_point_t cpu_post_present_time; - }; - std::deque<Frame> in_flight_frames; - - // Completed frames. - struct Timing { - DeviceContext::Clock::time_point_t::duration gputime, cputime; - - Frame frame; + std::shared_ptr<TimestampPool::Handle> head_handle, tail_handle; + DeviceClock::time_point_t cpu_present_time; }; - std::deque<std::unique_ptr<Timing>> timings; - private: - // Drains submissions and promotes them into a single frame object. - void drain_submissions_to_frame(); - - // Drains in flight frames and promotes them into a Timing object if they - // have completed. - void drain_frames_to_timings(); - - // Antilag 1 equivalent where we sleep after present to reduce queueing. 
- void sleep_in_present(); + using submissions_t = + std::shared_ptr<std::deque<std::unique_ptr<Submission>>>; + using present_id_t = std::uint64_t; + std::unordered_map<present_id_t, submissions_t> unpresented_submissions; public: QueueContext(DeviceContext& device_context, const VkQueue& queue, @@ -115,17 +85,13 @@ class QueueContext final : public Context { virtual ~QueueContext(); public: - void notify_submit(const VkSubmitInfo& info, - const std::shared_ptr<TimestampPool::Handle> head_handle, - const std::shared_ptr<TimestampPool::Handle> tail_handle, - const DeviceContext::Clock::time_point_t& now); - - void notify_submit(const VkSubmitInfo2& info, + void notify_submit(const present_id_t& present_id, const std::shared_ptr<TimestampPool::Handle> head_handle, const std::shared_ptr<TimestampPool::Handle> tail_handle, - const DeviceContext::Clock::time_point_t& now); + const DeviceClock::time_point_t& now); - void notify_present(const VkPresentInfoKHR& info); + void notify_present(const VkSwapchainKHR& swapchain, + const std::uint64_t& present_id); public: bool should_inject_timestamps() const; diff --git a/src/swapchain_monitor.cc b/src/swapchain_monitor.cc new file mode 100644 index 0000000..09fa8ba --- /dev/null +++ b/src/swapchain_monitor.cc @@ -0,0 +1,112 @@ +#include "swapchain_monitor.hh" +#include "device_context.hh" + +#include <vulkan/vulkan_core.h> + +#include <functional> +#include <mutex> + +namespace low_latency { + +SwapchainMonitor::SwapchainMonitor(const DeviceContext& device, + const bool was_low_latency_requested) + : device(device), was_low_latency_requested(was_low_latency_requested), + swapchain_worker( + std::bind_front(&SwapchainMonitor::do_swapchain_monitor, this)) {} + +SwapchainMonitor::~SwapchainMonitor() {} + +void SwapchainMonitor::WakeupSemaphore::signal( + const DeviceContext& device) const { + + const auto ssi = + VkSemaphoreSignalInfo{.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO, + .semaphore = this->timeline_semaphore, + 
.value = this->value}; + THROW_NON_VKSUCCESS(device.vtable.SignalSemaphore(device.device, &ssi)); +} + +void SwapchainMonitor::do_swapchain_monitor(const std::stop_token stoken) { + for (;;) { + auto lock = std::unique_lock{this->mutex}; + this->cv.wait(lock, stoken, + [&]() { return !this->wakeup_semaphores.empty(); }); + + if (stoken.stop_requested()) { + // Small chance an application might need outstanding semaphores + // to be signalled if it's closing to avoid a hang. + break; + } + + // Look for the latest submission and make sure it's completed. + if (!this->in_flight_submissions.empty()) { + const auto submission = this->in_flight_submissions.back(); + this->in_flight_submissions.clear(); + + if (!submission->empty()) { + submission->back()->tail_handle->await_time(); + } + } + + // We might want to signal them all? In theory it's the same timeline + // semaphore so obviously it's redundant to signal them one by one. In + // almost all cases, there should just be one here anyway. + const auto wakeup_semaphore = this->wakeup_semaphores.back(); + wakeup_semaphores.clear(); + + wakeup_semaphore.signal(this->device); + } +} + +void SwapchainMonitor::update_params( + const bool was_low_latency_requested, + const std::chrono::milliseconds present_delay) { + + const auto lock = std::scoped_lock{this->mutex}; + + this->was_low_latency_requested = was_low_latency_requested; + this->present_delay = present_delay; +} + +void SwapchainMonitor::notify_semaphore(const VkSemaphore& timeline_semaphore, + const std::uint64_t& value) { + + const auto lock = std::scoped_lock{this->mutex}; + + const auto wakeup_semaphore = WakeupSemaphore{ + .timeline_semaphore = timeline_semaphore, .value = value}; + + // Signal immediately if low_latency isn't requested or if we have no + // outstanding work. 
+ if (!this->was_low_latency_requested || + this->in_flight_submissions.empty()) { + + wakeup_semaphore.signal(this->device); + return; + } + + this->wakeup_semaphores.emplace_back(timeline_semaphore, value); + this->cv.notify_one(); +} + +void SwapchainMonitor::notify_present( + const QueueContext::submissions_t& submissions) { + + const auto lock = std::scoped_lock{this->mutex}; + + // Fast path where this work has already completed. + if (!this->wakeup_semaphores.empty() && !submissions->empty()) { + + const auto& finished = submissions->back()->tail_handle->get_time(); + if (finished.has_value()) { + this->wakeup_semaphores.back().signal(this->device); + this->wakeup_semaphores.clear(); + return; + } + } + + this->in_flight_submissions.emplace_back(submissions); + this->cv.notify_one(); +} + +} // namespace low_latency
\ No newline at end of file diff --git a/src/swapchain_monitor.hh b/src/swapchain_monitor.hh new file mode 100644 index 0000000..5678630 --- /dev/null +++ b/src/swapchain_monitor.hh @@ -0,0 +1,69 @@ +#ifndef SWAPCHAIN_MONITOR_HH_ +#define SWAPCHAIN_MONITOR_HH_ + +// The purpose of this file is to provide a SwapchainMonitor class definition. + +#include <vulkan/vulkan_core.h> + +#include <chrono> +#include <condition_variable> +#include <deque> +#include <mutex> +#include <thread> + +#include "queue_context.hh" + +namespace low_latency { + +class DeviceContext; + +// A swapchain monitor's job is to provide asynchronous wakeups for threads +// which request low_latency once the previous presentation has completed. +// It does this by signalling a semaphore a la VK_NV_low_latency2. +class SwapchainMonitor { + private: + const DeviceContext& device; + + // Configurable params for this swapchain. + std::chrono::milliseconds present_delay = std::chrono::milliseconds{0}; + bool was_low_latency_requested = false; + + struct WakeupSemaphore { + VkSemaphore timeline_semaphore; + std::uint64_t value; + + public: + void signal(const DeviceContext& device) const; + }; + std::deque<WakeupSemaphore> wakeup_semaphores; + std::deque<QueueContext::submissions_t> in_flight_submissions; + + std::mutex mutex; + std::condition_variable_any cv; + std::jthread swapchain_worker; + + private: + void do_swapchain_monitor(const std::stop_token stoken); + + public: + SwapchainMonitor(const DeviceContext& device, + const bool was_low_latency_requested); + SwapchainMonitor(const SwapchainMonitor&); + SwapchainMonitor(SwapchainMonitor&&); + SwapchainMonitor operator=(const SwapchainMonitor&); + SwapchainMonitor operator=(SwapchainMonitor&&); + ~SwapchainMonitor(); + + public: + void update_params(const bool was_low_latency_requested, + const std::chrono::milliseconds present_delay); + + void notify_semaphore(const VkSemaphore& timeline_semaphore, + const std::uint64_t& value); + + void 
notify_present(const QueueContext::submissions_t& submissions); +}; + +} // namespace low_latency + +#endif
\ No newline at end of file diff --git a/src/timestamp_pool.cc b/src/timestamp_pool.cc index 247d411..a37b2bc 100644 --- a/src/timestamp_pool.cc +++ b/src/timestamp_pool.cc @@ -152,19 +152,18 @@ void TimestampPool::Handle::setup_command_buffers( THROW_NON_VKSUCCESS(vtable.EndCommandBuffer(tail.command_buffer)); } -std::optional<DeviceContext::Clock::time_point_t> -TimestampPool::Handle::get_time() { - const auto& device_ctx = this->timestamp_pool.queue_context.device_context; - const auto& vtable = device_ctx.vtable; +struct QueryResult { + std::uint64_t value; + std::uint64_t available; +}; +std::optional<DeviceClock::time_point_t> TimestampPool::Handle::get_time() { + const auto& context = this->timestamp_pool.queue_context.device_context; + const auto& vtable = context.vtable; - struct QueryResult { - std::uint64_t value; - std::uint64_t available; - }; auto query_result = QueryResult{}; const auto result = vtable.GetQueryPoolResults( - device_ctx.device, query_pool, + context.device, query_pool, static_cast<std::uint32_t>(this->query_index), 1, sizeof(query_result), &query_result, sizeof(query_result), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT); @@ -177,30 +176,31 @@ TimestampPool::Handle::get_time() { return std::nullopt; } - return device_ctx.clock->ticks_to_time(query_result.value); + return context.clock->ticks_to_time(query_result.value); } -std::optional<DeviceContext::Clock::time_point_t> -TimestampPool::Handle::get_time_spinlock( - const DeviceContext::Clock::time_point_t& until) { +DeviceClock::time_point_t TimestampPool::Handle::await_time() { + const auto& context = this->timestamp_pool.queue_context.device_context; + const auto& vtable = context.vtable; - auto time = this->get_time(); - for (; !time.has_value(); time = this->get_time()) { - if (const auto now = DeviceContext::Clock::now(); now >= until) { - break; - } - } - return time; -} + struct QueryResult { + std::uint64_t value; + std::uint64_t available; + }; + auto 
query_result = QueryResult{}; -DeviceContext::Clock::time_point_t TimestampPool::Handle::get_time_spinlock() { - constexpr auto max = DeviceContext::Clock::time_point_t::max(); - const auto time = this->get_time_spinlock(max); - assert(time.has_value()); - return *time; + THROW_NON_VKSUCCESS(vtable.GetQueryPoolResults( + context.device, query_pool, + static_cast<std::uint32_t>(this->query_index), 1, sizeof(query_result), + &query_result, sizeof(query_result), + VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT | + VK_QUERY_RESULT_WAIT_BIT)); + assert(query_result.available); + + return context.clock->ticks_to_time(query_result.value); } -DeviceContext::Clock::time_point_t TimestampPool::Handle::get_time_required() { +DeviceClock::time_point_t TimestampPool::Handle::get_time_required() { const auto time = this->get_time(); assert(time.has_value()); return *time; diff --git a/src/timestamp_pool.hh b/src/timestamp_pool.hh index 67b34de..0d6c52d 100644 --- a/src/timestamp_pool.hh +++ b/src/timestamp_pool.hh @@ -18,11 +18,12 @@ #include <unordered_set> #include <vector> -#include "device_context.hh" +#include "device_clock.hh" namespace low_latency { class QueueContext; +class DeviceContext; class TimestampPool final { private: @@ -119,20 +120,15 @@ class TimestampPool final { void setup_command_buffers(const Handle& tail, const QueueContext& queue_context) const; - // Attempts to get_time, but returns an optional if it's not available - // yet. - std::optional<DeviceContext::Clock::time_point_t> get_time(); - - // Calls get_time() repeatedly under a spinlock, or gives up at - // time_point_t and returns std::nullopt. - std::optional<DeviceContext::Clock::time_point_t> - get_time_spinlock(const DeviceContext::Clock::time_point_t& until); + public: + // Attempts to get the time - optional if it's not available yet. + std::optional<DeviceClock::time_point_t> get_time(); - // Calls get_time() repeatedly under a spinlock until it's available. 
- DeviceContext::Clock::time_point_t get_time_spinlock(); + // Waits until the time is available and returns it. + DeviceClock::time_point_t await_time(); // Calls get_time with the assumption it's already available. - DeviceContext::Clock::time_point_t get_time_required(); + DeviceClock::time_point_t get_time_required(); }; public: |
