diff options
| author | Nicolas James <nj3ahxac@gmail.com> | 2026-03-26 19:50:19 +1100 |
|---|---|---|
| committer | Nicolas James <nj3ahxac@gmail.com> | 2026-03-26 19:50:19 +1100 |
| commit | 4ab8c7b97ea513e209705907afce9852934a7d86 (patch) | |
| tree | 558b7e5316de7c6cd1f15ee0920dfb3ea815fd92 /src | |
| parent | 7810f837b3cfa7dc45f360282cd1cafb2c3e6ca6 (diff) | |
Implement QueueNotifyOutOfBandNV, SetLatencySleepModeNV - also some bookkeeping
Diffstat (limited to 'src')
| -rw-r--r-- | src/device_context.cc | 39 | ||||
| -rw-r--r-- | src/device_context.hh | 24 | ||||
| -rw-r--r-- | src/layer.cc | 116 | ||||
| -rw-r--r-- | src/layer_context.hh | 6 | ||||
| -rw-r--r-- | src/queue_context.cc | 9 | ||||
| -rw-r--r-- | src/queue_context.hh | 5 |
6 files changed, 126 insertions, 73 deletions
diff --git a/src/device_context.cc b/src/device_context.cc index cea0540..58737e2 100644 --- a/src/device_context.cc +++ b/src/device_context.cc @@ -1,5 +1,4 @@ #include "device_context.hh" -#include "queue_context.hh" #include <time.h> #include <utility> @@ -23,7 +22,6 @@ DeviceContext::DeviceContext(InstanceContext& parent_instance, } DeviceContext::~DeviceContext() { - this->present_queue.reset(); // We will let the destructor handle clearing here, but they should be // unique by now (ie, removed from the layer's context map). for (const auto& [queue, queue_context] : this->queues) { @@ -94,6 +92,9 @@ DeviceContext::Clock::ticks_to_time(const std::uint64_t& ticks) const { } void DeviceContext::sleep_in_input() { + // TODO + + /* // Present hasn't happened yet, we don't know what queue to attack. if (!this->present_queue) { return; @@ -121,32 +122,30 @@ void DeviceContext::sleep_in_input() { // would get huge frame drops, loss of throughput, and the GPU would even // clock down. So naturally I am concerned about this approach, but it seems // to perform well so far in my own testing and is just beautifully elegant. + */ } -void DeviceContext::notify_antilag_update(const VkAntiLagDataAMD& data) { - this->antilag_mode = data.mode; - this->antilag_fps = data.maxFPS; // TODO +void DeviceContext::update_swapchain_infos( + const std::optional<VkSwapchainKHR> target, + const std::chrono::milliseconds& present_delay, + const bool was_low_latency_requested) { - // This might not be provided (probably just to set some settings?). - if (!data.pPresentationInfo) { - return; - } + const auto write = SwapchainInfo{ + .present_delay = present_delay, + .was_low_latency_requested = was_low_latency_requested, + }; - // Only care about the input stage for now. 
- if (data.pPresentationInfo->stage != VK_ANTI_LAG_STAGE_INPUT_AMD) { + if (target.has_value()) { + const auto iter = this->swapchain_infos.find(*target); + assert(iter != std::end(this->swapchain_infos)); // Must exist (spec). + iter->second = write; return; } - if (this->antilag_mode != VK_ANTI_LAG_MODE_ON_AMD) { - return; + // If we don't have a target (AMD's anti_lag), just write it to everything. + for (auto& iter : this->swapchain_infos) { + iter.second = write; } - - this->sleep_in_input(); -} - -void DeviceContext::notify_queue_present(const QueueContext& queue) { - assert(this->queues.contains(queue.queue)); - this->present_queue = this->queues[queue.queue]; } } // namespace low_latency
\ No newline at end of file diff --git a/src/device_context.hh b/src/device_context.hh index c76f376..6b5f000 100644 --- a/src/device_context.hh +++ b/src/device_context.hh @@ -28,6 +28,13 @@ struct DeviceContext final : public Context { const VkDevice device; const VkuDeviceDispatchTable vtable; + // Tiny struct to represent any swapchain's low latency state. + struct SwapchainInfo { + std::chrono::milliseconds present_delay = std::chrono::milliseconds{0}; + bool was_low_latency_requested = false; + }; + std::unordered_map<VkSwapchainKHR, SwapchainInfo> swapchain_infos{}; + std::unordered_map<VkQueue, std::shared_ptr<QueueContext>> queues; struct Clock { @@ -58,15 +65,6 @@ struct DeviceContext final : public Context { }; std::unique_ptr<Clock> clock; - std::uint32_t antilag_fps = 0; // TODO - VkAntiLagModeAMD antilag_mode = VK_ANTI_LAG_MODE_DRIVER_CONTROL_AMD; - - // The queue used in the last present. - std::shared_ptr<QueueContext> present_queue; - - private: - void sleep_in_input(); - public: DeviceContext(InstanceContext& parent_instance, PhysicalDeviceContext& parent_physical, @@ -75,9 +73,13 @@ struct DeviceContext final : public Context { virtual ~DeviceContext(); public: - void notify_antilag_update(const VkAntiLagDataAMD& data); + void sleep_in_input(); - void notify_queue_present(const QueueContext& queue); + // Updates the settings associated with that swapchain. If none is provided + // all swapchains are set to this value. 
+ void update_swapchain_infos(const std::optional<VkSwapchainKHR> target, + const std::chrono::milliseconds& present_delay, + const bool was_low_latency_requested); }; }; // namespace low_latency diff --git a/src/layer.cc b/src/layer.cc index 77ce296..438f331 100644 --- a/src/layer.cc +++ b/src/layer.cc @@ -295,6 +295,8 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice( DEVICE_VTABLE_LOAD(GetCalibratedTimestampsKHR); DEVICE_VTABLE_LOAD(ResetQueryPoolEXT); DEVICE_VTABLE_LOAD(SignalSemaphore); + DEVICE_VTABLE_LOAD(CreateSwapchainKHR); + DEVICE_VTABLE_LOAD(DestroySwapchainKHR); #undef DEVICE_VTABLE_LOAD const auto key = layer_context.get_key(*pDevice); @@ -735,17 +737,57 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceSurfaceCapabilities2KHR( lsc->presentModeCount = num_to_write; } +static VKAPI_ATTR VkResult VKAPI_CALL CreateSwapchainKHR( + VkDevice device, const VkSwapchainCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, VkSwapchainKHR* pSwapchain) { + + const auto context = layer_context.get_context(device); + + if (const auto result = context->vtable.CreateSwapchainKHR( + device, pCreateInfo, pAllocator, pSwapchain); + result != VK_SUCCESS) { + + return result; + } + + assert(context->swapchain_infos.try_emplace(*pSwapchain).second); + + return VK_SUCCESS; +} + +static VKAPI_ATTR void VKAPI_CALL +DestroySwapchainKHR(VkDevice device, VkSwapchainKHR swapchain, + const VkAllocationCallbacks* pAllocator) { + const auto context = layer_context.get_context(device); + + assert(context->swapchain_infos.erase(swapchain)); + + context->vtable.DestroySwapchainKHR(device, swapchain, pAllocator); +} + static VKAPI_ATTR void VKAPI_CALL AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD* pData) { const auto context = layer_context.get_context(device); assert(pData); - context->notify_antilag_update(*pData); -} -// Stubs for nvidia low latency 2. 
-void GetLatencyTimingsNV(VkDevice device, VkSwapchainKHR swapchain, -                         VkGetLatencyMarkerInfoNV* pLatencyMarkerInfo) { -    // STUB + // AL2 is synchronous while NVIDIA's low_latency2 is asynchronous. + // It's difficult to model an asynchronous impl inside a synchronous impl, + // but it's easy to do the inverse. As a result, we should implement + // NVIDIA's method and then have a working AL2 implementation follow using + // that existing code path. + + using namespace std::chrono; + const auto present_delay = duration_cast<milliseconds>(1s / pData->maxFPS); + context->update_swapchain_infos(std::nullopt, present_delay, + (pData->mode == VK_ANTI_LAG_MODE_ON_AMD)); + + if (!pData->pPresentationInfo) { + return; + } + + if (pData->pPresentationInfo->stage == VK_ANTI_LAG_STAGE_INPUT_AMD) { + context->sleep_in_input(); + } } VkResult LatencySleepNV(VkDevice device, VkSwapchainKHR swapchain, @@ -754,50 +796,54 @@ VkResult LatencySleepNV(VkDevice device, VkSwapchainKHR swapchain, const auto context = layer_context.get_context(device); assert(pSleepInfo); - // Keep going. - if (pSleepInfo->signalSemaphore) { - - // This is a hack obviously. I will have to associate queue submits with - // a semaphore and signal it correctly later. I'm not sure about the - // implications regarding multithreading, will have to think a bit about how to do this cleanly - // with our current anti lag. - static std::uint32_t counter = 1024; - - const auto ssi = VkSemaphoreSignalInfo{ - .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO, - .semaphore = pSleepInfo->signalSemaphore, - .value = counter, - }; + // TODO sleep here - // So we don't wait and this becomes a no-op instead of a freeze! - context->vtable.SignalSemaphore(device, &ssi); - - ++counter; - } - // STUB return VK_SUCCESS; } void QueueNotifyOutOfBandNV(VkQueue queue, const VkOutOfBandQueueTypeInfoNV* pQueueTypeInfo) { - // STUB -} + // This is really thoughtful from NVIDIA. 
Having the application explicitly + // state which queues should be ignored for latency evaluation is far + // superior to AMD's guessing game. + // Kind of interesting how you can't turn it back on once it's turned off. + // Also I really have no idea why pQueueTypeInfo's VkOutOfBandQueueTypeNV + // enum even exists (I guess we will find out later when nothing works). + const auto context = layer_context.get_context(queue); -void SetLatencyMarkerNV(VkDevice device, VkSwapchainKHR swapchain, - const VkSetLatencyMarkerInfoNV* pLatencyMarkerInfo) { - // STUB + context->should_ignore_latency = true; } VkResult SetLatencySleepModeNV(VkDevice device, VkSwapchainKHR swapchain, const VkLatencySleepModeInfoNV* pSleepModeInfo) { - const auto context = layer_context.get_context(device); - assert(pSleepModeInfo); - // STUB + using namespace std::chrono; + if (pSleepModeInfo) { + context->update_swapchain_infos( + swapchain, milliseconds{pSleepModeInfo->minimumIntervalUs}, + pSleepModeInfo->lowLatencyMode); + } else { + // If pSleepModeInfo is nullptr, it means no delay and no low latency. + context->update_swapchain_infos(swapchain, milliseconds{0}, false); + } return VK_SUCCESS; } +void SetLatencyMarkerNV(VkDevice device, VkSwapchainKHR swapchain, + const VkSetLatencyMarkerInfoNV* pLatencyMarkerInfo) { + // STUB + // We will probably end up making use of this in the future, but afaict it's + // not relevant for this layer's operation just yet. This function is + // NVIDIA's way of giving developers insight into their render pipeline. +} + +void GetLatencyTimingsNV(VkDevice device, VkSwapchainKHR swapchain, + VkGetLatencyMarkerInfoNV* pLatencyMarkerInfo) { + // STUB + // Just like SetLatencyMarkerNV this isn't relevant for us just yet. 
+} + } // namespace low_latency // This is a bit of template hackery which generates a wrapper function for each @@ -885,6 +931,8 @@ static const auto device_functions = func_map_t{ HOOK_ENTRY("vkSetLatencyMarkerNV", low_latency::SetLatencyMarkerNV), HOOK_ENTRY("vkSetLatencySleepModeNV", low_latency::SetLatencySleepModeNV), + HOOK_ENTRY("vkCreateSwapchainKHR", low_latency::CreateSwapchainKHR), + HOOK_ENTRY("vkDestroySwapchainKHR", low_latency::DestroySwapchainKHR), }; #undef HOOK_ENTRY diff --git a/src/layer_context.hh b/src/layer_context.hh index da13dc6..95f1cd5 100644 --- a/src/layer_context.hh +++ b/src/layer_context.hh @@ -48,15 +48,15 @@ using dispatch_context_t = typename context_for_t<D>::context; class LayerContext final : public Context { private: - // If this is not null and set to exactly "1", then we should sleep after + // If this is not null and set to exactly 1, then we should sleep after // present. static constexpr auto SLEEP_AFTER_PRESENT_ENV = "LOW_LATENCY_LAYER_SLEEP_AFTER_PRESENT"; - // If this is not null and set to exactly "1", then VK_NV_LOW_LATENCY2 + // If this is not null and set to exactly 1, then VK_NV_low_latency2 // should be provided instead of VK_AMD_anti_lag. static constexpr auto SPOOF_NVIDIA_ENV = - "LOW_LATENCY_LAYER_SPOOF_NV_LOWLATENCY2"; + "LOW_LATENCY_LAYER_SPOOF_NVIDIA"; public: std::mutex mutex; diff --git a/src/queue_context.cc b/src/queue_context.cc index 29dcbfb..d12f03d 100644 --- a/src/queue_context.cc +++ b/src/queue_context.cc @@ -157,21 +157,20 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) { this->drain_submissions_to_frame(); this->drain_frames_to_timings(); - // Call up to notify the device now that we're done with this frame. - // We have to do this because antilag 2 data is sent to the device, not - // any particular queue. - this->device_context.notify_queue_present(*this); - // We should only sleep in present if two conditions are met: // 1. 
Our antilag_mode isn't set to on, because otherwise the sleep will // be done in input and with far better results. // 2. The 'is_antilag_1_enabled' flag, which exists at the layer's // context, is set. + // + /* + * WIP REFLEX if (this->device_context.antilag_mode != VK_ANTI_LAG_MODE_ON_AMD && this->device_context.instance.layer.is_antilag_1_enabled) { this->sleep_in_present(); } + */ } const auto debug_log_time2 = [](auto& stream, const auto& diff) { diff --git a/src/queue_context.hh b/src/queue_context.hh index 701fc0d..221626f 100644 --- a/src/queue_context.hh +++ b/src/queue_context.hh @@ -57,6 +57,11 @@ class QueueContext final : public Context { std::unique_ptr<TimestampPool> timestamp_pool; + // NVIDIA's extension lets the application explicitly state that this queue + // does not contribute to the frame. AMD's extension has no such mechanism - + // so this will always be false. + bool should_ignore_latency = false; + public: // Potentially in flight queue submissions that come from this queue. struct Submission { |
