Fix refactor latency regression for VK_NV_LowLatency2 by checking if work has already completed

author: Nicolas James <nj3ahxac@gmail.com> 2026-04-08 12:55:29 +1000
committer: Nicolas James <nj3ahxac@gmail.com> 2026-04-08 12:55:29 +1000
commit: 453d5b0052bd17ed74d47570ffff403ffcd9ebb3 (patch)
tree: 14ab71a5ff27d5485faea061f6e8fe26c0e19640 /src
parent: eb9719cc8b9a308654ccd2c3bce8a7047b6e2a1a (diff)
8 files changed, 78 insertions, 20 deletions
diff --git a/src/frame_span.cc b/src/frame_span.cc
index 732d6f3..8764aa1 100644
--- a/src/frame_span.cc
+++ b/src/frame_span.cc
@@ -21,4 +21,11 @@ void FrameSpan::await_completed() const {
     this->head_handle->await_end();
 }
 
+bool FrameSpan::has_completed() const {
+    if (this->tail_handle) { // A tail handle exists: its end decides.
+        return this->tail_handle->has_end();
+    }
+    return this->head_handle->has_end(); // No tail: use the head handle.
+}
+
 } // namespace low_latency
 \ No newline at end of file
diff --git a/src/frame_span.hh b/src/frame_span.hh
index 5220702..638554b 100644
--- a/src/frame_span.hh
+++ b/src/frame_span.hh
@@ -28,6 +28,8 @@ class FrameSpan {
     void update(std::shared_ptr<TimestampPool::Handle> handle);
 
   public:
+    // Check whether GPU work has completed, without blocking.
+    bool has_completed() const;
     // Wait for for GPU work to complete.
     void await_completed() const;
 };
diff --git a/src/strategies/low_latency2/queue_strategy.cc b/src/strategies/low_latency2/queue_strategy.cc
index a020c0d..e3ed808 100644
--- a/src/strategies/low_latency2/queue_strategy.cc
+++ b/src/strategies/low_latency2/queue_strategy.cc
@@ -29,15 +29,14 @@ static void notify_submit_impl(LowLatency2QueueStrategy& strategy,
     const auto [iter, inserted] = strategy.frame_spans.try_emplace(present_id);
     if (inserted) {
         iter->second = std::make_unique<FrameSpan>(std::move(handle));
+        // Add our present_id to our ring tracking if it's non-zero.
+        if (present_id) {
+            strategy.stale_present_ids.push_back(present_id);
+        }
     } else {
         iter->second->update(std::move(handle));
     }
 
-    // Add our present_id to our ring tracking if it's non-zero.
-    if (inserted && present_id) {
-        strategy.stale_present_ids.push_back(present_id);
-    }
-
     // Remove stale present_id's if they weren't presented to.
     if (std::size(strategy.stale_present_ids) >
         LowLatency2QueueStrategy::MAX_TRACKED_PRESENTS) {
diff --git a/src/strategies/low_latency2/queue_strategy.hh b/src/strategies/low_latency2/queue_strategy.hh
index 6d41027..2a03c91 100644
--- a/src/strategies/low_latency2/queue_strategy.hh
+++ b/src/strategies/low_latency2/queue_strategy.hh
@@ -16,6 +16,9 @@ class QueueContext;
 
 class LowLatency2QueueStrategy final : public QueueStrategy {
   public:
+    // Our tracking for present_ids could grow without limit if present isn't
+    // called. To guard against this, we store the last MAX_TRACKED_PRESENTS
+    // unique present_ids and use them to evict stale submissions.
     static constexpr auto MAX_TRACKED_PRESENTS = 50;
 
     // Mapping of present_id's to submissions. Grabbed later by the device
diff --git a/src/strategies/low_latency2/swapchain_monitor.cc b/src/strategies/low_latency2/swapchain_monitor.cc
index a70fa6c..7442eec 100644
--- a/src/strategies/low_latency2/swapchain_monitor.cc
+++ b/src/strategies/low_latency2/swapchain_monitor.cc
@@ -35,33 +35,33 @@ void SwapchainMonitor::do_monitor(const std::stop_token stoken) {
     for (;;) {
         auto lock = std::unique_lock{this->mutex};
         this->cv.wait(lock, stoken,
-                      [&]() { return this->semaphore_spans.has_value(); });
+                      [&]() { return !this->pending_signals.empty(); });
 
         // Stop only if we're stopped and we have nothing to signal.
-        if (stoken.stop_requested() && !this->semaphore_spans.has_value()) {
+        if (stoken.stop_requested() && this->pending_signals.empty()) {
             break;
         }
 
         // Grab the most recent semaphore. When work completes, signal it.
-        const auto semaphore_span = std::move(*this->semaphore_spans);
-        this->semaphore_spans.reset();
+        const auto pending_signal = std::move(this->pending_signals.front());
+        this->pending_signals.pop_front();
 
         // If we're stopping, signal the semaphore and don't worry about work
         // actually completing.
         if (stoken.stop_requested()) {
-            semaphore_span.wakeup_semaphore.signal(this->device);
+            pending_signal.wakeup_semaphore.signal(this->device);
             break;
         }
 
         // Unlock, wait for work to finish, lock again.
         lock.unlock();
-        for (const auto& frame_span : semaphore_span.frame_spans) {
+        for (const auto& frame_span : pending_signal.frame_spans) {
             if (frame_span) {
                 frame_span->await_completed();
             }
         }
-
         lock.lock();
+
         using namespace std::chrono;
         if (this->present_delay != 0us) {
             const auto last_time = this->last_signal_time;
@@ -75,7 +75,7 @@ void SwapchainMonitor::do_monitor(const std::stop_token stoken) {
         }
         lock.unlock();
 
-        semaphore_span.wakeup_semaphore.signal(this->device);
+        pending_signal.wakeup_semaphore.signal(this->device);
     }
 }
 
@@ -94,12 +94,19 @@ void SwapchainMonitor::notify_semaphore(const VkSemaphore& timeline_semaphore,
     }
 
     // Signal immediately if we have no outstanding work.
-    if (this->pending_frame_spans.empty()) {
+    if (std::ranges::all_of(this->pending_frame_spans,
+                            [](const auto& frame_span) {
+                                if (!frame_span) {
+                                    return true;
+                                }
+                                return frame_span->has_completed();
+                            })) {
         wakeup_semaphore.signal(this->device);
+        this->pending_signals.clear();
         return;
     }
 
-    this->semaphore_spans.emplace(SemaphoreSpans{
+    this->pending_signals.emplace_back(PendingSignal{
         .wakeup_semaphore = wakeup_semaphore,
         .frame_spans = std::move(this->pending_frame_spans),
     });
diff --git a/src/strategies/low_latency2/swapchain_monitor.hh b/src/strategies/low_latency2/swapchain_monitor.hh
index 837f8e4..a5f8362 100644
--- a/src/strategies/low_latency2/swapchain_monitor.hh
+++ b/src/strategies/low_latency2/swapchain_monitor.hh
@@ -26,16 +26,13 @@ class SwapchainMonitor final {
         void signal(const DeviceContext& device) const;
     };
 
-    // An empty vector here represents our 'no work' state.
     std::vector<std::unique_ptr<FrameSpan>> pending_frame_spans{};
 
-    // A pairing of semaphore -> submissions.
-    // If the Submissions completes then signal the bundled semaphore.
-    struct SemaphoreSpans {
+    struct PendingSignal {
         WakeupSemaphore wakeup_semaphore{};
         std::vector<std::unique_ptr<FrameSpan>> frame_spans{};
     };
-    std::optional<SemaphoreSpans> semaphore_spans{};
+    std::deque<PendingSignal> pending_signals{};
 
   protected:
     const DeviceContext& device;
diff --git a/src/timestamp_pool.cc b/src/timestamp_pool.cc
index afb12f7..d84169d 100644
--- a/src/timestamp_pool.cc
+++ b/src/timestamp_pool.cc
@@ -207,6 +207,39 @@ TimestampPool::Handle::await_time_impl(const std::uint32_t offset) const {
 void TimestampPool::Handle::await_start() const { this->await_time_impl(0); }
 void TimestampPool::Handle::await_end() const { this->await_time_impl(1); }
 
+std::optional<std::uint64_t>
+TimestampPool::Handle::has_time_impl(const std::uint32_t offset) const {
+    // Polls a single query for its value without VK_QUERY_RESULT_WAIT_BIT.
+    const auto& context = this->timestamp_pool.queue_context.device;
+    const auto& vtable = context.vtable;
+    const auto& query_pool = *this->query_chunk.query_pool;
+    // [0] receives the 64-bit query value, [1] its availability word.
+    auto query_result = std::array<std::uint64_t, 2>{};
+
+    const auto result = vtable.GetQueryPoolResults(
+        context.device, query_pool, this->query_index + offset, 1,
+        sizeof(query_result), &query_result, sizeof(query_result),
+        VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT);
+    // Any result other than VK_SUCCESS or VK_NOT_READY is thrown as-is.
+    if (result != VK_NOT_READY && result != VK_SUCCESS) {
+        throw result;
+    }
+    // A zero availability word means the query is not available yet.
+    if (!query_result[1]) {
+        return std::nullopt;
+    }
+    return query_result[0];
+}
+
+// Checks if the time is available - doesn't block.
+bool TimestampPool::Handle::has_start() const {
+    return this->has_time_impl(0).has_value(); // Offset 0, as await_start.
+}
+
+bool TimestampPool::Handle::has_end() const {
+    return this->has_time_impl(1).has_value(); // Offset 1, as await_end.
+}
+
 TimestampPool::~TimestampPool() {}
 
 } // namespace low_latency
 \ No newline at end of file
diff --git a/src/timestamp_pool.hh b/src/timestamp_pool.hh
index 809c6a4..767455a 100644
--- a/src/timestamp_pool.hh
+++ b/src/timestamp_pool.hh
@@ -10,6 +10,7 @@
 #include <deque>
 #include <memory>
 #include <mutex>
+#include <optional>
 #include <thread>
 #include <unordered_set>
 #include <vector>
@@ -130,6 +131,15 @@ class TimestampPool final {
         // Blocks until the time is available.
         void await_start() const;
         void await_end() const;
+
+      private:
+        std::optional<std::uint64_t>
+        has_time_impl(const std::uint32_t offset) const;
+
+      public:
+        // Checks if the time is available - doesn't block.
+        bool has_start() const;
+        bool has_end() const;
     };
 
   private:
author	Nicolas James <nj3ahxac@gmail.com>	2026-04-08 12:55:29 +1000
committer	Nicolas James <nj3ahxac@gmail.com>	2026-04-08 12:55:29 +1000
commit	453d5b0052bd17ed74d47570ffff403ffcd9ebb3 (patch)
tree	14ab71a5ff27d5485faea061f6e8fe26c0e19640 /src
parent	eb9719cc8b9a308654ccd2c3bce8a7047b6e2a1a (diff)