diff options
| author | Nicolas James <nj3ahxac@gmail.com> | 2026-03-26 19:50:19 +1100 |
|---|---|---|
| committer | Nicolas James <nj3ahxac@gmail.com> | 2026-03-26 19:50:19 +1100 |
| commit | 4ab8c7b97ea513e209705907afce9852934a7d86 (patch) | |
| tree | 558b7e5316de7c6cd1f15ee0920dfb3ea815fd92 /src | |
| parent | 7810f837b3cfa7dc45f360282cd1cafb2c3e6ca6 (diff) | |
Implement QueueNotifyOutOfBandNV, SetLatencySleepModeNV - also some bookkeeping
Diffstat (limited to 'src')
| -rw-r--r-- | src/device_context.cc | 39 | ||||
| -rw-r--r-- | src/device_context.hh | 24 | ||||
| -rw-r--r-- | src/layer.cc | 116 | ||||
| -rw-r--r-- | src/layer_context.hh | 6 | ||||
| -rw-r--r-- | src/queue_context.cc | 9 | ||||
| -rw-r--r-- | src/queue_context.hh | 5 |
6 files changed, 126 insertions, 73 deletions
diff --git a/src/device_context.cc b/src/device_context.cc index cea0540..58737e2 100644 --- a/src/device_context.cc +++ b/src/device_context.cc @@ -1,5 +1,4 @@ #include "device_context.hh" -#include "queue_context.hh" #include <time.h> #include <utility> @@ -23,7 +22,6 @@ DeviceContext::DeviceContext(InstanceContext& parent_instance, } DeviceContext::~DeviceContext() { - this->present_queue.reset(); // We will let the destructor handle clearing here, but they should be // unique by now (ie, removed from the layer's context map). for (const auto& [queue, queue_context] : this->queues) { @@ -94,6 +92,9 @@ DeviceContext::Clock::ticks_to_time(const std::uint64_t& ticks) const { } void DeviceContext::sleep_in_input() { + // TODO + + /* // Present hasn't happened yet, we don't know what queue to attack. if (!this->present_queue) { return; @@ -121,32 +122,30 @@ void DeviceContext::sleep_in_input() { // would get huge frame drops, loss of throughput, and the GPU would even // clock down. So naturally I am concerned about this approach, but it seems // to perform well so far in my own testing and is just beautifully elegant. + */ } -void DeviceContext::notify_antilag_update(const VkAntiLagDataAMD& data) { - this->antilag_mode = data.mode; - this->antilag_fps = data.maxFPS; // TODO +void DeviceContext::update_swapchain_infos( + const std::optional<VkSwapchainKHR> target, + const std::chrono::milliseconds& present_delay, + const bool was_low_latency_requested) { - // This might not be provided (probably just to set some settings?). - if (!data.pPresentationInfo) { - return; - } + const auto write = SwapchainInfo{ + .present_delay = present_delay, + .was_low_latency_requested = was_low_latency_requested, + }; - // Only care about the input stage for now. 
- if (data.pPresentationInfo->stage != VK_ANTI_LAG_STAGE_INPUT_AMD) { + if (target.has_value()) { + const auto iter = this->swapchain_infos.find(*target); + assert(iter != std::end(this->swapchain_infos)); // Must exist (spec). + iter->second = write; return; } - if (this->antilag_mode != VK_ANTI_LAG_MODE_ON_AMD) { - return; + // If we don't have a target (AMD's anti_lag), just write it to everything. + for (auto& iter : this->swapchain_infos) { + iter.second = write; } - - this->sleep_in_input(); -} - -void DeviceContext::notify_queue_present(const QueueContext& queue) { - assert(this->queues.contains(queue.queue)); - this->present_queue = this->queues[queue.queue]; } } // namespace low_latency
\ No newline at end of file diff --git a/src/device_context.hh b/src/device_context.hh index c76f376..6b5f000 100644 --- a/src/device_context.hh +++ b/src/device_context.hh @@ -28,6 +28,13 @@ struct DeviceContext final : public Context { const VkDevice device; const VkuDeviceDispatchTable vtable; + // Tiny struct to represent any swapchain's low latency state. + struct SwapchainInfo { + std::chrono::milliseconds present_delay = std::chrono::milliseconds{0}; + bool was_low_latency_requested = false; + }; + std::unordered_map<VkSwapchainKHR, SwapchainInfo> swapchain_infos{}; + std::unordered_map<VkQueue, std::shared_ptr<QueueContext>> queues; struct Clock { @@ -58,15 +65,6 @@ struct DeviceContext final : public Context { }; std::unique_ptr<Clock> clock; - std::uint32_t antilag_fps = 0; // TODO - VkAntiLagModeAMD antilag_mode = VK_ANTI_LAG_MODE_DRIVER_CONTROL_AMD; - - // The queue used in the last present. - std::shared_ptr<QueueContext> present_queue; - - private: - void sleep_in_input(); - public: DeviceContext(InstanceContext& parent_instance, PhysicalDeviceContext& parent_physical, @@ -75,9 +73,13 @@ struct DeviceContext final : public Context { virtual ~DeviceContext(); public: - void notify_antilag_update(const VkAntiLagDataAMD& data); + void sleep_in_input(); - void notify_queue_present(const QueueContext& queue); + // Updates the settings associated with that swapchain. If none is provided + // all swapchains are set to this value. 
+ void update_swapchain_infos(const std::optional<VkSwapchainKHR> target, + const std::chrono::milliseconds& present_delay, + const bool was_low_latency_requested); }; }; // namespace low_latency diff --git a/src/layer.cc b/src/layer.cc index 77ce296..438f331 100644 --- a/src/layer.cc +++ b/src/layer.cc @@ -295,6 +295,8 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice( DEVICE_VTABLE_LOAD(GetCalibratedTimestampsKHR); DEVICE_VTABLE_LOAD(ResetQueryPoolEXT); DEVICE_VTABLE_LOAD(SignalSemaphore); + DEVICE_VTABLE_LOAD(CreateSwapchainKHR); + DEVICE_VTABLE_LOAD(DestroySwapchainKHR); #undef DEVICE_VTABLE_LOAD const auto key = layer_context.get_key(*pDevice); @@ -735,17 +737,57 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceSurfaceCapabilities2KHR( lsc->presentModeCount = num_to_write; } +static VKAPI_ATTR VkResult VKAPI_CALL CreateSwapchainKHR( + VkDevice device, const VkSwapchainCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, VkSwapchainKHR* pSwapchain) { + + const auto context = layer_context.get_context(device); + + if (const auto result = context->vtable.CreateSwapchainKHR( + device, pCreateInfo, pAllocator, pSwapchain); + result != VK_SUCCESS) { + + return result; + } + + assert(context->swapchain_infos.try_emplace(*pSwapchain).second); + + return VK_SUCCESS; +} + +static VKAPI_ATTR void VKAPI_CALL +DestroySwapchainKHR(VkDevice device, VkSwapchainKHR swapchain, + const VkAllocationCallbacks* pAllocator) { + const auto context = layer_context.get_context(device); + + assert(context->swapchain_infos.erase(swapchain)); + + context->vtable.DestroySwapchainKHR(device, swapchain, pAllocator); +} + static VKAPI_ATTR void VKAPI_CALL AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD* pData) { const auto context = layer_context.get_context(device); assert(pData); - context->notify_antilag_update(*pData); -} -// Stubs for nvidia low latency 2. 
-void GetLatencyTimingsNV(VkDevice device, VkSwapchainKHR swapchain, -                         VkGetLatencyMarkerInfoNV* pLatencyMarkerInfo) { -    // STUB + // AL2 is synchronous while NVIDIA's low_latency2 is asynchronous. + // It's difficult to model an asynchronous impl inside a synchronous impl, + // but it's easy to do the inverse. As a result, we should implement + // NVIDIA's method and then have a working AL2 implementation follow using + // that existing code path. + + using namespace std::chrono; + const auto present_delay = duration_cast<milliseconds>(1s / pData->maxFPS); + context->update_swapchain_infos(std::nullopt, present_delay, + (pData->mode == VK_ANTI_LAG_MODE_ON_AMD)); + + if (!pData->pPresentationInfo) { + return; + } + + if (pData->pPresentationInfo->stage == VK_ANTI_LAG_STAGE_INPUT_AMD) { + context->sleep_in_input(); + } } VkResult LatencySleepNV(VkDevice device, VkSwapchainKHR swapchain, @@ -754,50 +796,54 @@ VkResult LatencySleepNV(VkDevice device, VkSwapchainKHR swapchain, const auto context = layer_context.get_context(device); assert(pSleepInfo); - // Keep going. - if (pSleepInfo->signalSemaphore) { - - // This is a hack obviously. I will have to associate queue submits with - // a semaphore and signal it correctly later. I'm not sure about the - // implications regarding multithreading, will have to think a bit about how to do this cleanly - // with our current anti lag. - static std::uint32_t counter = 1024; - - const auto ssi = VkSemaphoreSignalInfo{ - .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO, - .semaphore = pSleepInfo->signalSemaphore, - .value = counter, - }; + // TODO sleep here - // So we don't wait and this becomes a no-op instead of a freeze! - context->vtable.SignalSemaphore(device, &ssi); - - ++counter; - } - // STUB return VK_SUCCESS; } void QueueNotifyOutOfBandNV(VkQueue queue, const VkOutOfBandQueueTypeInfoNV* pQueueTypeInfo) { - // STUB -} + // This is really thoughtful from NVIDIA. 
Having the application explicitly + // state which queues should be ignored for latency evaluation is far + // superior to AMD's guessing game. + // Kind of interesting how you can't turn it back on once it's turned off. + // Also I really have no idea why pQueueTypeInfo's VkOutOfBandQueueTypeNV + // enum even exists (I guess we will find out later when nothing works). + const auto context = layer_context.get_context(queue); -void SetLatencyMarkerNV(VkDevice device, VkSwapchainKHR swapchain, - const VkSetLatencyMarkerInfoNV* pLatencyMarkerInfo) { - // STUB + context->should_ignore_latency = true; } VkResult SetLatencySleepModeNV(VkDevice device, VkSwapchainKHR swapchain, const VkLatencySleepModeInfoNV* pSleepModeInfo) { - const auto context = layer_context.get_context(device); - assert(pSleepModeInfo); - // STUB + using namespace std::chrono; + if (pSleepModeInfo) { + context->update_swapchain_infos( + swapchain, milliseconds{pSleepModeInfo->minimumIntervalUs}, + pSleepModeInfo->lowLatencyMode); + } else { + // If pSleepModeInfo is nullptr, it means no delay and no low latency. + context->update_swapchain_infos(swapchain, milliseconds{0}, false); + } return VK_SUCCESS; } +void SetLatencyMarkerNV(VkDevice device, VkSwapchainKHR swapchain, + const VkSetLatencyMarkerInfoNV* pLatencyMarkerInfo) { + // STUB + // We will probably end up making use of this in the future, but afaict it's + // not relevant for this layer's operation just yet. This function is + // NVIDIA's way of giving developers insight into their render pipeline. +} + +void GetLatencyTimingsNV(VkDevice device, VkSwapchainKHR swapchain, + VkGetLatencyMarkerInfoNV* pLatencyMarkerInfo) { + // STUB + // Just like SetLatencyMarkerNV this isn't relevant for us just yet. 
+} + } // namespace low_latency // This is a bit of template hackery which generates a wrapper function for each @@ -885,6 +931,8 @@ static const auto device_functions = func_map_t{ HOOK_ENTRY("vkSetLatencyMarkerNV", low_latency::SetLatencyMarkerNV), HOOK_ENTRY("vkSetLatencySleepModeNV", low_latency::SetLatencySleepModeNV), + HOOK_ENTRY("vkCreateSwapchainKHR", low_latency::CreateSwapchainKHR), + HOOK_ENTRY("vkDestroySwapchainKHR", low_latency::DestroySwapchainKHR), }; #undef HOOK_ENTRY diff --git a/src/layer_context.hh b/src/layer_context.hh index da13dc6..95f1cd5 100644 --- a/src/layer_context.hh +++ b/src/layer_context.hh @@ -48,15 +48,15 @@ using dispatch_context_t = typename context_for_t<D>::context; class LayerContext final : public Context { private: - // If this is not null and set to exactly "1", then we should sleep after + // If this is not null and set to exactly 1, then we should sleep after // present. static constexpr auto SLEEP_AFTER_PRESENT_ENV = "LOW_LATENCY_LAYER_SLEEP_AFTER_PRESENT"; - // If this is not null and set to exactly "1", then VK_NV_LOW_LATENCY2 + // If this is not null and set to exactly 1, then VK_NV_low_latency2 // should be provided instead of VK_AMD_anti_lag. static constexpr auto SPOOF_NVIDIA_ENV = - "LOW_LATENCY_LAYER_SPOOF_NV_LOWLATENCY2"; + "LOW_LATENCY_LAYER_SPOOF_NVIDIA"; public: std::mutex mutex; diff --git a/src/queue_context.cc b/src/queue_context.cc index 29dcbfb..d12f03d 100644 --- a/src/queue_context.cc +++ b/src/queue_context.cc @@ -157,21 +157,20 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) { this->drain_submissions_to_frame(); this->drain_frames_to_timings(); - // Call up to notify the device now that we're done with this frame. - // We have to do this because antilag 2 data is sent to the device, not - // any particular queue. - this->device_context.notify_queue_present(*this); - // We should only sleep in present if two conditions are met: // 1. 
Our antilag_mode isn't set to on, because otherwise the sleep will // be done in input and with far better results. // 2. The 'is_antilag_1_enabled' flag, which exists at the layer's // context, is set. + // + /* + * WIP REFLEX if (this->device_context.antilag_mode != VK_ANTI_LAG_MODE_ON_AMD && this->device_context.instance.layer.is_antilag_1_enabled) { this->sleep_in_present(); } + */ } const auto debug_log_time2 = [](auto& stream, const auto& diff) { diff --git a/src/queue_context.hh b/src/queue_context.hh index 701fc0d..221626f 100644 --- a/src/queue_context.hh +++ b/src/queue_context.hh @@ -57,6 +57,11 @@ class QueueContext final : public Context { std::unique_ptr<TimestampPool> timestamp_pool; + // NVIDIA's extension lets the application explicitly state that this queue + // does not contribute to the frame. AMD's extension has no such mechanism - + // so this will always be false. + bool should_ignore_latency = false; + public: // Potentially in flight queue submissions that come from this queue. struct Submission { |
