From 644bc4ed5edd4e3ffa88750bdacb147c75df9546 Mon Sep 17 00:00:00 2001 From: Nicolas James Date: Mon, 30 Mar 2026 15:54:10 +1100 Subject: Fix AL2 via VK_NV_low_latency2's async implementation, fix race in TimestampPool --- src/context.hh | 5 -- src/device_clock.cc | 3 +- src/device_context.cc | 34 ----------- src/device_context.hh | 9 +-- src/helper.hh | 5 ++ src/layer.cc | 132 +++++++++++++++-------------------------- src/layer_context.cc | 7 +-- src/layer_context.hh | 15 ++--- src/physical_device_context.cc | 14 ++--- src/physical_device_context.hh | 10 +--- src/queue_context.cc | 36 +++++------ src/queue_context.hh | 8 +-- src/swapchain_monitor.cc | 18 +++++- src/swapchain_monitor.hh | 4 ++ src/timestamp_pool.cc | 50 ++++++++++------ src/timestamp_pool.hh | 13 ++-- 16 files changed, 157 insertions(+), 206 deletions(-) diff --git a/src/context.hh b/src/context.hh index 6524984..718b697 100644 --- a/src/context.hh +++ b/src/context.hh @@ -3,11 +3,6 @@ namespace low_latency { -#define THROW_NON_VKSUCCESS(x) \ - if (const auto result = x; result != VK_SUCCESS) { \ - throw result; \ - } - // A context class doesn't do much by itself. We just use it to provide a // virtual destructor so we can store a bunch of shared_ptrs in the same // container and rely on RTTI in the layer context. It also deletes the copy and diff --git a/src/device_clock.cc b/src/device_clock.cc index 52c86d3..8e0e408 100644 --- a/src/device_clock.cc +++ b/src/device_clock.cc @@ -1,5 +1,6 @@ #include "device_clock.hh" #include "device_context.hh" +#include "helper.hh" #include @@ -37,7 +38,7 @@ void DeviceClock::calibrate() { }; auto calibrated_result = CalibratedResult{}; - THROW_NON_VKSUCCESS(device.vtable.GetCalibratedTimestampsKHR( + THROW_NOT_VKSUCCESS(device.vtable.GetCalibratedTimestampsKHR( device.device, 2, std::data(infos), &calibrated_result.device, &this->error_bound)); diff --git a/src/device_context.cc b/src/device_context.cc index 5438e40..e2f2a4a 100644 --- a/src/device_context.cc +++ b/src/device_context.cc @@ -28,40 +28,6 @@ DeviceContext::~DeviceContext() { } } -/* -void DeviceContext::sleep_in_input() { - // TODO - - // Present hasn't happened yet, we don't know what queue to attack. - if (!this->present_queue) { - return; - } - - const auto& frames = this->present_queue->in_flight_frames; - // No frame here means we're behind the GPU and do not need to delay. - // If anything we should speed up... - if (!std::size(frames)) { - return; - } - - // If we're here, that means that there might be an outstanding frame that's - // sitting on our present_queue which hasn't yet completed, so we need to - // stall until it's finished. - const auto& last_frame = frames.back(); - assert(std::size(last_frame.submissions)); - const auto& last_frame_submission = last_frame.submissions.back(); - last_frame_submission->end_handle->get_time_spinlock(); - - // From our sleep in present implementation, just spinning until - // the previous frame has completed did not work well. This was because - // there was a delay between presentation and when new work was given - // to the GPU. If we stalled the CPU without trying to account for this, we - // would get huge frame drops, loss of throughput, and the GPU would even - // clock down. So naturally I am concerned about this approach, but it seems - // to perform well so far in my own testing and is just beautifully elegant. -} -*/ - void DeviceContext::update_params( const std::optional target, const std::chrono::milliseconds& present_delay, diff --git a/src/device_context.hh b/src/device_context.hh index 172801c..0e0a4eb 100644 --- a/src/device_context.hh +++ b/src/device_context.hh @@ -24,7 +24,8 @@ class DeviceContext final : public Context { InstanceContext& instance; PhysicalDeviceContext& physical_device; - // Whether or not we were asked to do NV_VK_LowLatency2 or VK_AMD_anti_lag. + // Whether or not we were asked to do NV_VK_LowLatency2 or VK_AMD_anti_lag + // at the device level. const bool was_capability_requested; const VkDevice device; @@ -39,13 +40,13 @@ class DeviceContext final : public Context { public: DeviceContext(InstanceContext& parent_instance, PhysicalDeviceContext& parent_physical, - const VkDevice& device, const bool was_antilag_requested, + const VkDevice& device, const bool was_capability_requested, VkuDeviceDispatchTable&& vtable); virtual ~DeviceContext(); public: - // Updates the settings associated with that swapchain. If none is provided - // all swapchains are set to this value. + // Updates the settings associated with that swapchain. If no swapchain + // target is provided all swapchains are set to this value. void update_params(const std::optional target, const std::chrono::milliseconds& present_delay, const bool was_low_latency_requested); diff --git a/src/helper.hh b/src/helper.hh index 468f146..6dde9be 100644 --- a/src/helper.hh +++ b/src/helper.hh @@ -8,6 +8,11 @@ namespace low_latency { +#define THROW_NOT_VKSUCCESS(x) \ + if (const auto result = x; result != VK_SUCCESS) { \ + throw result; \ + } + // Small templates which allow us to SFINAE find pNext structs. template static T* find_next(void* const head, const VkStructureType& stype) { diff --git a/src/layer.cc b/src/layer.cc index 7a7ffc8..813c267 100644 --- a/src/layer.cc +++ b/src/layer.cc @@ -78,8 +78,6 @@ CreateInstance(const VkInstanceCreateInfo* pCreateInfo, INSTANCE_VTABLE_LOAD(GetPhysicalDeviceQueueFamilyProperties2); INSTANCE_VTABLE_LOAD(GetPhysicalDeviceFeatures2); INSTANCE_VTABLE_LOAD(GetPhysicalDeviceSurfaceCapabilities2KHR); - INSTANCE_VTABLE_LOAD(GetPhysicalDeviceProperties2); - INSTANCE_VTABLE_LOAD(GetPhysicalDeviceProperties2KHR); #undef INSTANCE_VTABLE_LOAD const auto lock = std::scoped_lock{layer_context.mutex}; @@ -102,11 +100,13 @@ DestroyInstance(VkInstance instance, const VkAllocationCallbacks* allocator) { // Erase our physical devices owned by this instance from the global // context. for (const auto& [key, _] : context->phys_devices) { - assert(layer_context.contexts.erase(key)); + assert(layer_context.contexts.contains(key)); + layer_context.contexts.erase(key); } const auto key = layer_context.get_key(instance); - assert(layer_context.contexts.erase(key)); + assert(layer_context.contexts.contains(key)); + layer_context.contexts.erase(key); // Should be the last ptr now like DestroyDevice. assert(context.unique()); @@ -154,23 +154,6 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice( const auto requested = std::unordered_set( std::from_range, enabled_extensions); - // There's the antilag extension that might be requested here - Antilag2. - // Then there's the other thing we provide, which is our AntiLag1 - // equivalent. Calling them AL1 and AL2, where AL1 is requested via - // an env var and AL2 is requested at the device level via the extension, - // the cases where we exit with a bad code or deliberately no-op are: - // - // !SUPPORTED && !AL2 && AL1 -> No-op hooks - // !AL2 && !AL1 -> No-op hooks. - // !SUPPORTED && AL2 -> VK_ERROR_INITIALIZATION_FAILED - // - // Note that even though the user has explicitly enabled AL1 via an env var, - // failing hard here by returning INIT_FAILED if the device doesn't support - // it is wrong. The vulkan application could just be creating a device that - // cannot support it which is unrelated to anything present related. This - // is not the case with AL2, because the vulkan application has to - // explicitly ask for the extension when it creates the device. - const auto was_capability_requested = requested.contains(VK_AMD_ANTI_LAG_EXTENSION_NAME) || requested.contains(VK_NV_LOW_LATENCY_2_EXTENSION_NAME); @@ -204,11 +187,9 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice( // Only append the extra extension if it wasn't already asked for. for (const auto& wanted : PhysicalDeviceContext::required_extensions) { - if (requested.contains(wanted)) { - continue; + if (!requested.contains(wanted)) { + next_extensions.push_back(wanted); } - - next_extensions.push_back(wanted); } return next_extensions; @@ -284,14 +265,16 @@ DestroyDevice(VkDevice device, const VkAllocationCallbacks* allocator) { // Remove all owned queues from our global context pool. for (const auto& [queue, _] : device_context->queues) { const auto key = layer_context.get_key(queue); - assert(layer_context.contexts.erase(key)); + assert(layer_context.contexts.contains(key)); + layer_context.contexts.erase(key); } const auto key = layer_context.get_key(device); - assert(layer_context.contexts.erase(key)); + assert(layer_context.contexts.contains(key)); + layer_context.contexts.erase(key); - // should be the last shared ptr now, so its destructor can be called. - // the destructor should expect its owned queues to be unique as well! + // Should be the last shared ptr now, so its destructor can be called. + // The destructor should expect its owned queues to be unique as well. assert(device_context.unique()); return func; @@ -361,7 +344,7 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count, const VkSubmitInfo* submit_infos, VkFence fence) { const auto context = layer_context.get_context(queue); - const auto& vtable = context->device_context.vtable; + const auto& vtable = context->device.vtable; if (!submit_count || !context->should_inject_timestamps()) { return vtable.QueueSubmit(queue, submit_count, submit_infos, fence); @@ -447,7 +430,7 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count, const VkSubmitInfo2* submit_infos, VkFence fence) { const auto context = layer_context.get_context(queue); - const auto& vtable = context->device_context.vtable; + const auto& vtable = context->device.vtable; if (!submit_count || !context->should_inject_timestamps()) { return vtable.QueueSubmit2(queue, submit_count, submit_infos, fence); @@ -511,7 +494,7 @@ static VKAPI_ATTR VkResult VKAPI_CALL vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* present_info) { const auto context = layer_context.get_context(queue); - const auto& vtable = context->device_context.vtable; + const auto& vtable = context->device.vtable; if (const auto res = vtable.QueuePresentKHR(queue, present_info); res != VK_SUCCESS) { @@ -524,7 +507,11 @@ vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* present_info) { for (auto i = std::uint32_t{0}; i < present_info->swapchainCount; ++i) { const auto& swapchain = present_info->pSwapchains[i]; + + // For VK_AMD_anti_lag, providing a pPresentId isn't part of the spec. + // So we just set it to 0 if it isn't provided. const auto present_id = pid ? pid->pPresentIds[i] : 0; + context->notify_present(swapchain, present_id); } @@ -549,9 +536,9 @@ static VKAPI_ATTR VkResult VKAPI_CALL EnumerateDeviceExtensionProperties( physical_device, pLayerName, pPropertyCount, pProperties); } - // If we're spoofing nvidia we want to provide their extension instead. + // If we're exposing reflex we want to provide their extension instead. const auto extension_properties = [&]() -> VkExtensionProperties { - if (context->instance.layer.should_spoof_nvidia) { + if (context->instance.layer.should_expose_reflex) { return {.extensionName = VK_NV_LOW_LATENCY_2_EXTENSION_NAME, .specVersion = VK_NV_LOW_LATENCY_2_SPEC_VERSION}; } @@ -561,13 +548,12 @@ static VKAPI_ATTR VkResult VKAPI_CALL EnumerateDeviceExtensionProperties( if (pLayerName) { // This query is for our layer specifically. - - if (!pProperties) { // Querying how much space they need. + if (!pProperties) { *pPropertyCount = 1; return VK_SUCCESS; } - if (!*pPropertyCount) { // They gave us zero space to work with. + if (!*pPropertyCount) { return VK_INCOMPLETE; } @@ -618,8 +604,7 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2( vtable.GetPhysicalDeviceFeatures2(physical_device, pFeatures); // We're going to use this feature for both VK_AMD_anti_lag and - // VK_NV_low_latency2. It simplifies things a bit if we share a code path - // for now. TODO remove it in the future for VK_AMD_anti_lag. + // VK_NV_low_latency2. It simplifies things a bit if we share a code path. if (const auto pidf = find_next( pFeatures, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENT_ID_FEATURES_KHR); @@ -628,10 +613,10 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2( pidf->presentId = true; } - // Don't provide AntiLag if we're trying to spoof nvidia. - // Nvidia uses VkSurfaceCapabilities2KHR to determine if a surface - // is capable of reflex instead of AMD's physical device switch found here. - if (context->instance.layer.should_spoof_nvidia) { + // Don't provide AntiLag if we're exposing reflex - VK_NV_low_latency2 uses + // VkSurfaceCapabilities2KHR to determine if a surface is capable of reflex + // instead of AMD's physical device switch found here. + if (context->instance.layer.should_expose_reflex) { return; } @@ -649,29 +634,6 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2KHR( return GetPhysicalDeviceFeatures2(physical_device, pFeatures); } -static VKAPI_ATTR void VKAPI_CALL -GetPhysicalDeviceProperties2(VkPhysicalDevice physical_device, - VkPhysicalDeviceProperties2* pProperties) { - - const auto context = layer_context.get_context(physical_device); - const auto& vtable = context->instance.vtable; - - vtable.GetPhysicalDeviceProperties2(physical_device, pProperties); - - constexpr auto NVIDIA_VENDOR_ID = 0x10DE; - constexpr auto NVIDIA_DEVICE_ID = 0x2684; // rtx 4080 i think? - if (context->instance.layer.should_spoof_nvidia) { - pProperties->properties.vendorID = NVIDIA_VENDOR_ID; - pProperties->properties.deviceID = NVIDIA_DEVICE_ID; - } -} - -static VKAPI_ATTR void VKAPI_CALL -GetPhysicalDeviceProperties2KHR(VkPhysicalDevice physical_device, - VkPhysicalDeviceProperties2* pProperties) { - return GetPhysicalDeviceProperties2(physical_device, pProperties); -} - static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceSurfaceCapabilities2KHR( VkPhysicalDevice physical_device, const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo, @@ -684,7 +646,7 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceSurfaceCapabilities2KHR( physical_device, pSurfaceInfo, pSurfaceCapabilities); // Don't do this unless we're spoofing nvidia. - if (!context->instance.layer.should_spoof_nvidia) { + if (!context->instance.layer.should_expose_reflex) { return; } @@ -742,8 +704,10 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateSwapchainKHR( was_low_latency_requested = slci->latencyModeEnable; } - context->swapchain_monitors.try_emplace(*pSwapchain, *context, - was_low_latency_requested); + + const auto [_, did_emplace] = context->swapchain_monitors.try_emplace( + *pSwapchain, *context, was_low_latency_requested); + assert(did_emplace); return VK_SUCCESS; } @@ -753,7 +717,8 @@ DestroySwapchainKHR(VkDevice device, VkSwapchainKHR swapchain, const VkAllocationCallbacks* pAllocator) { const auto context = layer_context.get_context(device); - assert(context->swapchain_monitors.erase(swapchain)); + assert(context->swapchain_monitors.contains(swapchain)); + context->swapchain_monitors.erase(swapchain); context->vtable.DestroySwapchainKHR(device, swapchain, pAllocator); } @@ -765,9 +730,8 @@ AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD* pData) { // AL2 is a synchronous while NVIDIA's low_latencty2 is asynchronous. // It's difficult to model an asynchronous impl inside a synchronous impl, - // but it's easy to do the inverse. As a result, we should implement - // NVIDIA's method and then have a working AL2 implementation follow using - // that existing code path. + // but it's easy to do the inverse. AMD's extension piggybacks on NVIDIA's + // more complicated implementation. const auto present_delay = [&]() { using namespace std::chrono; @@ -777,12 +741,18 @@ AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD* pData) { context->update_params(std::nullopt, present_delay, (pData->mode == VK_ANTI_LAG_MODE_ON_AMD)); - if (!pData->pPresentationInfo) { + if (!pData->pPresentationInfo || + pData->pPresentationInfo->stage != VK_ANTI_LAG_STAGE_INPUT_AMD) { + return; } - if (pData->pPresentationInfo->stage == VK_ANTI_LAG_STAGE_INPUT_AMD) { - // TODO use nvidia's path + // VK_AMD_anti_lag doesn't provide a swapchain, so we can't map it to + // a queue. Our previous implementation used the last queue that presented + // and made sure that at least that one completed. I think it's more robust + // to make sure they all complete. + for (auto& iter : context->swapchain_monitors) { + iter.second.wait_until(); } } @@ -832,21 +802,18 @@ VkResult SetLatencySleepModeNV(VkDevice device, VkSwapchainKHR swapchain, // If pSleepModeInfo is nullptr, it means no delay and no low latency. context->update_params(swapchain, std::chrono::milliseconds{0}, false); } + return VK_SUCCESS; } void SetLatencyMarkerNV(VkDevice device, VkSwapchainKHR swapchain, const VkSetLatencyMarkerInfoNV* pLatencyMarkerInfo) { // STUB - // We will probably end up making use of this in the future, but afaict it's - // not relevant for this layer's operation just yet. This function is - // NVIDIA's way of giving developers insight into their render pipeline. } void GetLatencyTimingsNV(VkDevice device, VkSwapchainKHR swapchain, VkGetLatencyMarkerInfoNV* pLatencyMarkerInfo) { // STUB - // Just like SetLatencyMarkerNV this isn't relevant for us just yet. } } // namespace low_latency @@ -907,11 +874,6 @@ static const auto instance_functions = func_map_t{ HOOK_ENTRY("vkGetPhysicalDeviceSurfaceCapabilities2KHR", low_latency::GetPhysicalDeviceSurfaceCapabilities2KHR), - - HOOK_ENTRY("vkGetPhysicalDeviceProperties2", - low_latency::GetPhysicalDeviceProperties2), - HOOK_ENTRY("vkGetPhysicalDeviceProperties2KHR", - low_latency::GetPhysicalDeviceProperties2KHR), }; static const auto device_functions = func_map_t{ diff --git a/src/layer_context.cc b/src/layer_context.cc index 4699202..4399338 100644 --- a/src/layer_context.cc +++ b/src/layer_context.cc @@ -4,15 +4,14 @@ #include namespace low_latency { - + LayerContext::LayerContext() { const auto parse_bool_env = [](const auto& name) -> bool { const auto env = std::getenv(name); return env && std::string_view{env} == "1"; }; - - this->is_antilag_1_enabled = parse_bool_env(SLEEP_AFTER_PRESENT_ENV); - this->should_spoof_nvidia = parse_bool_env(SPOOF_NVIDIA_ENV); + + this->should_expose_reflex = parse_bool_env(EXPOSE_REFLEX_ENV); } LayerContext::~LayerContext() {} diff --git a/src/layer_context.hh b/src/layer_context.hh index 95f1cd5..731b273 100644 --- a/src/layer_context.hh +++ b/src/layer_context.hh @@ -48,22 +48,15 @@ using dispatch_context_t = typename context_for_t::context; class LayerContext final : public Context { private: - // If this is not null and set to exactly 1, then we should sleep after - // present. - static constexpr auto SLEEP_AFTER_PRESENT_ENV = - "LOW_LATENCY_LAYER_SLEEP_AFTER_PRESENT"; - - // If this is not null and set to exactly 1, then VK_NV_low_latency2 - // should be provided instead of VK_AMD_anti_lag. - static constexpr auto SPOOF_NVIDIA_ENV = - "LOW_LATENCY_LAYER_SPOOF_NVIDIA"; + // If this is not null and set to 1 then VK_NV_low_latency2 should be + // provided instead of VK_AMD_anti_lag. + static constexpr auto EXPOSE_REFLEX_ENV = "LOW_LATENCY_LAYER_EXPOSE_REFLEX"; public: std::mutex mutex; std::unordered_map> contexts; - bool is_antilag_1_enabled = false; - bool should_spoof_nvidia = false; + bool should_expose_reflex = false; public: LayerContext(); diff --git a/src/physical_device_context.cc b/src/physical_device_context.cc index 9c4ad8e..86bf9ab 100644 --- a/src/physical_device_context.cc +++ b/src/physical_device_context.cc @@ -1,4 +1,5 @@ #include "physical_device_context.hh" +#include "helper.hh" #include @@ -26,25 +27,24 @@ PhysicalDeviceContext::PhysicalDeviceContext( vtable.GetPhysicalDeviceQueueFamilyProperties2(physical_device, &count, nullptr); - using qp_t = PhysicalDeviceContext::queue_properties_t; - auto result = qp_t( + auto result = std::vector( count, VkQueueFamilyProperties2{ .sType = VK_STRUCTURE_TYPE_QUEUE_FAMILY_PROPERTIES_2}); vtable.GetPhysicalDeviceQueueFamilyProperties2(physical_device, &count, std::data(result)); - return std::make_unique(std::move(result)); + return std::make_unique>( + std::move(result)); }(); this->supports_required_extensions = [&]() { auto count = std::uint32_t{}; - THROW_NON_VKSUCCESS(vtable.EnumerateDeviceExtensionProperties( + THROW_NOT_VKSUCCESS(vtable.EnumerateDeviceExtensionProperties( physical_device, nullptr, &count, nullptr)); auto supported_extensions = std::vector(count); - THROW_NON_VKSUCCESS(vtable.EnumerateDeviceExtensionProperties( - physical_device, nullptr, &count, - std::data(supported_extensions))); + THROW_NOT_VKSUCCESS(vtable.EnumerateDeviceExtensionProperties( + physical_device, nullptr, &count, std::data(supported_extensions))); const auto supported = supported_extensions | diff --git a/src/physical_device_context.hh b/src/physical_device_context.hh index f7ad289..d2e094e 100644 --- a/src/physical_device_context.hh +++ b/src/physical_device_context.hh @@ -23,16 +23,12 @@ class PhysicalDeviceContext final : public Context { public: InstanceContext& instance; - const VkPhysicalDevice physical_device; - std::unique_ptr properties; - - using queue_properties_t = std::vector; - std::unique_ptr queue_properties; + std::unique_ptr properties; + std::unique_ptr> queue_properties; - // Will be true if the physical device supports everything in - // this->required_extensions. + // Will be true if the physical device supports all of required_extensions. bool supports_required_extensions = false; public: diff --git a/src/queue_context.cc b/src/queue_context.cc index 30e73c1..1192bb6 100644 --- a/src/queue_context.cc +++ b/src/queue_context.cc @@ -1,44 +1,43 @@ #include "queue_context.hh" #include "device_context.hh" -#include "layer_context.hh" +#include "helper.hh" #include "timestamp_pool.hh" #include + #include namespace low_latency { -QueueContext::CommandPoolOwner::CommandPoolOwner( - const QueueContext& queue_context) - : queue_context(queue_context) { +QueueContext::CommandPoolOwner::CommandPoolOwner(const QueueContext& queue) + : queue(queue) { - const auto& device_context = this->queue_context.device_context; + const auto& device_context = this->queue.device; const auto cpci = VkCommandPoolCreateInfo{ .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, - .queueFamilyIndex = queue_context.queue_family_index, + .queueFamilyIndex = queue.queue_family_index, }; - THROW_NON_VKSUCCESS(device_context.vtable.CreateCommandPool( + THROW_NOT_VKSUCCESS(device_context.vtable.CreateCommandPool( device_context.device, &cpci, nullptr, &this->command_pool)); } QueueContext::CommandPoolOwner::~CommandPoolOwner() { - const auto& device_context = this->queue_context.device_context; + const auto& device_context = this->queue.device; device_context.vtable.DestroyCommandPool(device_context.device, this->command_pool, nullptr); } -QueueContext::QueueContext(DeviceContext& device_context, const VkQueue& queue, +QueueContext::QueueContext(DeviceContext& device, const VkQueue& queue, const std::uint32_t& queue_family_index) - : device_context(device_context), queue(queue), - queue_family_index(queue_family_index), + : device(device), queue(queue), queue_family_index(queue_family_index), command_pool(std::make_unique(*this)) { // Only construct a timestamp pool if we support it! - if (device_context.physical_device.supports_required_extensions) { + if (device.physical_device.supports_required_extensions) { this->timestamp_pool = std::make_unique(*this); } } @@ -77,7 +76,6 @@ void QueueContext::notify_submit( void QueueContext::notify_present(const VkSwapchainKHR& swapchain, const present_id_t& present_id) { - // Notify the device that this swapchain was just presented to. // We're avoiding a double hash here - don't use operator[] and erase. auto iter = this->unpresented_submissions.try_emplace(present_id).first; @@ -86,24 +84,28 @@ void QueueContext::notify_present(const VkSwapchainKHR& swapchain, std::make_shared>>(); } - this->device_context.notify_present(swapchain, iter->second); + this->device.notify_present(swapchain, iter->second); // Important, we nuke the submission because now it's presented. this->unpresented_submissions.erase(iter); } bool QueueContext::should_inject_timestamps() const { - const auto& physical_device = this->device_context.physical_device; + const auto& physical_device = this->device.physical_device; + // Our layer is a no-op here if we don't support it. if (!physical_device.supports_required_extensions) { return false; } // Don't bother injecting timestamps during queue submission if we // aren't planning on doing anything anyway. - if (!this->device_context.was_capability_requested && - !physical_device.instance.layer.is_antilag_1_enabled) { + if (!this->device.was_capability_requested) { + return false; + } + // Don't do it if we've been marked as 'out of band' by nvidia's extension. + if (this->should_ignore_latency) { return false; } diff --git a/src/queue_context.hh b/src/queue_context.hh index 48500e1..a52e718 100644 --- a/src/queue_context.hh +++ b/src/queue_context.hh @@ -22,18 +22,18 @@ class QueueContext final : public Context { static constexpr auto MAX_TRACKED_SUBMISSIONS = 50u; public: - DeviceContext& device_context; + DeviceContext& device; const VkQueue queue; const std::uint32_t queue_family_index; struct CommandPoolOwner final { private: - const QueueContext& queue_context; + const QueueContext& queue; VkCommandPool command_pool; public: - CommandPoolOwner(const QueueContext& queue_context); + CommandPoolOwner(const QueueContext& queue); CommandPoolOwner(const CommandPoolOwner&) = delete; CommandPoolOwner(CommandPoolOwner&&) = delete; CommandPoolOwner operator=(const CommandPoolOwner&) = delete; @@ -67,7 +67,7 @@ class QueueContext final : public Context { // // When our hook sees a VkQueuePresentKHR, we take the provided present_id // and notify our device that it needs to watch for when this completes. - // We give it our submission. Now, it's out of our hands. We remove the + // We give it our submissions. Now, it's out of our hands. We remove the // present_id_t mapping when doing so. struct Submission { std::shared_ptr head_handle, tail_handle; diff --git a/src/swapchain_monitor.cc b/src/swapchain_monitor.cc index 09fa8ba..adeb315 100644 --- a/src/swapchain_monitor.cc +++ b/src/swapchain_monitor.cc @@ -1,5 +1,6 @@ #include "swapchain_monitor.hh" #include "device_context.hh" +#include "helper.hh" #include @@ -23,7 +24,7 @@ void SwapchainMonitor::WakeupSemaphore::signal( VkSemaphoreSignalInfo{.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO, .semaphore = this->timeline_semaphore, .value = this->value}; - THROW_NON_VKSUCCESS(device.vtable.SignalSemaphore(device.device, &ssi)); + THROW_NOT_VKSUCCESS(device.vtable.SignalSemaphore(device.device, &ssi)); } void SwapchainMonitor::do_swapchain_monitor(const std::stop_token stoken) { @@ -109,4 +110,19 @@ void SwapchainMonitor::notify_present( this->cv.notify_one(); } +void SwapchainMonitor::wait_until() { + // No reason to lock when using VK_AMD_anti_lag. + if (this->in_flight_submissions.empty()) { + return; + } + + const auto last_submissions = this->in_flight_submissions.back(); + this->in_flight_submissions.clear(); + if (last_submissions->empty()) { + return; + } + + last_submissions->back()->tail_handle->await_time(); +} + } // namespace low_latency \ No newline at end of file diff --git a/src/swapchain_monitor.hh b/src/swapchain_monitor.hh index 5678630..be81d59 100644 --- a/src/swapchain_monitor.hh +++ b/src/swapchain_monitor.hh @@ -62,6 +62,10 @@ class SwapchainMonitor { const std::uint64_t& value); void notify_present(const QueueContext::submissions_t& submissions); + + public: + // Synchronously wait until all in-flight submissions have completed. + void wait_until(); }; } // namespace low_latency diff --git a/src/timestamp_pool.cc b/src/timestamp_pool.cc index a37b2bc..4bb236b 100644 --- a/src/timestamp_pool.cc +++ b/src/timestamp_pool.cc @@ -1,7 +1,9 @@ #include "timestamp_pool.hh" #include "device_context.hh" +#include "helper.hh" #include "queue_context.hh" +#include #include #include #include @@ -13,18 +15,18 @@ TimestampPool::QueryChunk::QueryPoolOwner::QueryPoolOwner( const QueueContext& queue_context) : queue_context(queue_context) { - const auto& device_context = this->queue_context.device_context; + const auto& device_context = this->queue_context.device; const auto qpci = VkQueryPoolCreateInfo{.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, .queryType = VK_QUERY_TYPE_TIMESTAMP, .queryCount = QueryChunk::CHUNK_SIZE}; - THROW_NON_VKSUCCESS(device_context.vtable.CreateQueryPool( + THROW_NOT_VKSUCCESS(device_context.vtable.CreateQueryPool( device_context.device, &qpci, nullptr, &this->query_pool)); } TimestampPool::QueryChunk::QueryPoolOwner::~QueryPoolOwner() { - const auto& device_context = this->queue_context.device_context; + const auto& device_context = this->queue_context.device; device_context.vtable.DestroyQueryPool(device_context.device, this->query_pool, nullptr); } @@ -43,7 +45,7 @@ TimestampPool::QueryChunk::CommandBuffersOwner::CommandBuffersOwner( const QueueContext& queue_context) : queue_context(queue_context), command_buffers(CHUNK_SIZE) { - const auto& device_context = queue_context.device_context; + const auto& device_context = queue_context.device; const auto cbai = VkCommandBufferAllocateInfo{ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, @@ -51,12 +53,12 @@ TimestampPool::QueryChunk::CommandBuffersOwner::CommandBuffersOwner( .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, .commandBufferCount = CHUNK_SIZE, }; - THROW_NON_VKSUCCESS(device_context.vtable.AllocateCommandBuffers( + THROW_NOT_VKSUCCESS(device_context.vtable.AllocateCommandBuffers( device_context.device, &cbai, std::data(this->command_buffers))); } TimestampPool::QueryChunk::CommandBuffersOwner::~CommandBuffersOwner() { - const auto& device_context = this->queue_context.device_context; + const auto& device_context = this->queue_context.device; device_context.vtable.FreeCommandBuffers( device_context.device, *this->queue_context.command_pool, @@ -64,6 +66,13 @@ TimestampPool::QueryChunk::CommandBuffersOwner::~CommandBuffersOwner() { std::data(this->command_buffers)); } +VkCommandBuffer TimestampPool::QueryChunk::CommandBuffersOwner::operator[]( + const std::size_t& i) { + + assert(i < CHUNK_SIZE); + return this->command_buffers[i]; +} + TimestampPool::QueryChunk::~QueryChunk() {} TimestampPool::TimestampPool(QueueContext& queue_context) @@ -75,6 +84,7 @@ TimestampPool::TimestampPool(QueueContext& queue_context) } std::shared_ptr TimestampPool::acquire() { + const auto lock = std::scoped_lock{this->mutex}; // Gets the empty one, or inserts a new one and returns it. const auto not_empty_iter = [this]() -> auto { @@ -97,12 +107,12 @@ std::shared_ptr TimestampPool::acquire() { // Grab any element from our set and erase it immediately after. auto& indices = *(*not_empty_iter)->free_indices; const auto query_index = *std::begin(indices); - assert(indices.erase(query_index)); + indices.erase(query_index); return std::make_shared(*this, *not_empty_iter, query_index); } -TimestampPool::Handle::Handle(const TimestampPool& timestamp_pool, +TimestampPool::Handle::Handle(TimestampPool& timestamp_pool, const std::shared_ptr& origin_chunk, const std::uint64_t& query_index) : timestamp_pool(timestamp_pool), origin_chunk(origin_chunk), @@ -110,10 +120,12 @@ TimestampPool::Handle::Handle(const TimestampPool& timestamp_pool, command_buffer((*origin_chunk->command_buffers)[query_index]) {} TimestampPool::Handle::~Handle() { + const auto lock = std::scoped_lock{this->timestamp_pool.mutex}; + // Parent destructing shouldn't mean we should have a bunch of // insertions for zero reason. if (const auto ptr = this->origin_chunk.lock(); ptr) { - assert(ptr->free_indices->insert(this->query_index).second); + ptr->free_indices->insert(this->query_index); } } @@ -124,32 +136,32 @@ void TimestampPool::Handle::setup_command_buffers( .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, }; - const auto& device_context = queue_context.device_context; + const auto& device_context = queue_context.device; const auto& vtable = device_context.vtable; vtable.ResetQueryPoolEXT(device_context.device, this->query_pool, static_cast(this->query_index), 1); - THROW_NON_VKSUCCESS(vtable.ResetCommandBuffer(this->command_buffer, 0)); - THROW_NON_VKSUCCESS(vtable.BeginCommandBuffer(this->command_buffer, &cbbi)); + THROW_NOT_VKSUCCESS(vtable.ResetCommandBuffer(this->command_buffer, 0)); + THROW_NOT_VKSUCCESS(vtable.BeginCommandBuffer(this->command_buffer, &cbbi)); vtable.CmdWriteTimestamp2KHR( this->command_buffer, VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, this->query_pool, static_cast(this->query_index)); - THROW_NON_VKSUCCESS(vtable.EndCommandBuffer(this->command_buffer)); + THROW_NOT_VKSUCCESS(vtable.EndCommandBuffer(this->command_buffer)); vtable.ResetQueryPoolEXT(device_context.device, tail.query_pool, static_cast(tail.query_index), 1); - THROW_NON_VKSUCCESS(vtable.ResetCommandBuffer(tail.command_buffer, 0)); - THROW_NON_VKSUCCESS(vtable.BeginCommandBuffer(tail.command_buffer, &cbbi)); + THROW_NOT_VKSUCCESS(vtable.ResetCommandBuffer(tail.command_buffer, 0)); + THROW_NOT_VKSUCCESS(vtable.BeginCommandBuffer(tail.command_buffer, &cbbi)); vtable.CmdWriteTimestamp2KHR( tail.command_buffer, VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT, tail.query_pool, static_cast(tail.query_index)); - THROW_NON_VKSUCCESS(vtable.EndCommandBuffer(tail.command_buffer)); + THROW_NOT_VKSUCCESS(vtable.EndCommandBuffer(tail.command_buffer)); } struct QueryResult { @@ -157,7 +169,7 @@ struct QueryResult { std::uint64_t available; }; std::optional TimestampPool::Handle::get_time() { - const auto& context = this->timestamp_pool.queue_context.device_context; + const auto& context = this->timestamp_pool.queue_context.device; const auto& vtable = context.vtable; auto query_result = QueryResult{}; @@ -180,7 +192,7 @@ std::optional TimestampPool::Handle::get_time() { } DeviceClock::time_point_t TimestampPool::Handle::await_time() { - const auto& context = this->timestamp_pool.queue_context.device_context; + const auto& context = this->timestamp_pool.queue_context.device; const auto& vtable = context.vtable; struct QueryResult { @@ -189,7 +201,7 @@ DeviceClock::time_point_t TimestampPool::Handle::await_time() { }; auto query_result = QueryResult{}; - THROW_NON_VKSUCCESS(vtable.GetQueryPoolResults( + THROW_NOT_VKSUCCESS(vtable.GetQueryPoolResults( context.device, query_pool, static_cast(this->query_index), 1, sizeof(query_result), &query_result, sizeof(query_result), diff --git a/src/timestamp_pool.hh b/src/timestamp_pool.hh index 0d6c52d..d8ee359 100644 --- a/src/timestamp_pool.hh +++ b/src/timestamp_pool.hh @@ -3,7 +3,7 @@ // The purpose of this file is to provide the definition of a 'timestamp pool'. // It manages blocks of timestamp query pools, hands them out when requested, -// and allocates more when (if) we run out. +// and allocates more when (if) we run out. It _should_ be thread safe. // Usage: // 1. Get handle with .acquire(). // 2. Write start/end timestamp operations with the handle's pool and index @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -28,6 +29,7 @@ class DeviceContext; class TimestampPool final { private: QueueContext& queue_context; + std::mutex mutex; // A chunk of data which is useful for making timestamp queries. // Allows association of an index to a query pool and command buffer. @@ -72,10 +74,7 @@ class TimestampPool final { ~CommandBuffersOwner(); public: - VkCommandBuffer operator[](const std::size_t& i) { - assert(i < CHUNK_SIZE); - return this->command_buffers[i]; - } + VkCommandBuffer operator[](const std::size_t& i); }; std::unique_ptr command_buffers; @@ -98,7 +97,7 @@ class TimestampPool final { friend class TimestampPool; private: - const TimestampPool& timestamp_pool; + TimestampPool& timestamp_pool; const std::weak_ptr origin_chunk; public: @@ -107,7 +106,7 @@ class TimestampPool final { const VkCommandBuffer command_buffer; public: - Handle(const TimestampPool& timestamp_pool, + Handle(TimestampPool& timestamp_pool, const std::shared_ptr& origin_chunk, const std::uint64_t& query_index); Handle(const Handle& handle) = delete; -- cgit v1.2.3