From 79c9f28f804b591a2981f9d7c224747db906b1e8 Mon Sep 17 00:00:00 2001 From: Nicolas James Date: Thu, 19 Feb 2026 17:02:40 +1100 Subject: WIP timing improvements --- src/layer.cc | 34 +++++++++++++++++++++-- src/queue_context.cc | 78 ++++++++++++++++++++++++++++++++++------------------ src/queue_context.hh | 27 ++++++++++-------- 3 files changed, 99 insertions(+), 40 deletions(-) diff --git a/src/layer.cc b/src/layer.cc index c19fbfc..c49e6a0 100644 --- a/src/layer.cc +++ b/src/layer.cc @@ -449,6 +449,32 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count, return vtable.QueueSubmit(queue, submit_count, submit_infos, fence); } + // What's happening here? + // We are making a very modest modification to all vkQueueSubmits where we + // inject a start and end timestamp query command buffer that writes when + // the GPU started and finished work for each submission. It's important to + // note that we deliberately do *NOT* use or modify any semaphores + // as a mechanism to signal completion or the availability of these submits + // for multiple reasons: + // 1. Modifying semaphores (particularly in vkQueueSubmit1) is ANNOYING + // to do correctly. The pNext chain is const and difficult to modify + // without traversing the entire thing and doing surgical deep copies + // and patches for multiple pNext's sType's. It's easier to leave it + // alone. + // 2. Semaphores only signal at the end of their work, so we cannot use + // them as a mechanism to know if work has started without doing + // another dummy submission. This adds complexity and also skews our + // timestamps slightly. + // 3. Semaphores can be waited on, which sounds nice in theory, but in my + // own testing waiting on semaphores can cause scheduling issues and + // cause wakeups as late as 1ms from when it was signalled, which is + // unbelievably bad if we're trying to do frame pacing. This means + // we are going to have to do a spinlock poll anyway. + // 4. Guess what info we need? 
Timestamp information. Guess what + // supports polling of an availability bit? Timestamp information. + // Why bother with semaphores at all then? Polling a semaphore might + // be faster, but the difference appears to be negligible. + using cbs_t = std::vector; auto next_submits = std::vector{}; @@ -462,13 +488,15 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count, // more explicit + insurance if that changes. auto handles = std::vector>{}; + const auto now = std::chrono::steady_clock::now(); + std::ranges::transform( std::span{submit_infos, submit_count}, std::back_inserter(next_submits), [&](const auto& submit) { const auto head_handle = queue_context->timestamp_pool->acquire(); const auto tail_handle = queue_context->timestamp_pool->acquire(); head_handle->setup_command_buffers(*tail_handle, *queue_context); - queue_context->notify_submit(submit, head_handle, tail_handle); + queue_context->notify_submit(submit, head_handle, tail_handle, now); handles.emplace_back(head_handle); handles.emplace_back(tail_handle); @@ -508,6 +536,8 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count, auto next_submits = std::vector{}; auto next_cbs = std::vector>{}; auto handles = std::vector>{}; + + const auto now = std::chrono::steady_clock::now(); std::ranges::transform( std::span{submit_infos, submit_count}, std::back_inserter(next_submits), @@ -515,7 +545,7 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count, const auto head_handle = queue_context->timestamp_pool->acquire(); const auto tail_handle = queue_context->timestamp_pool->acquire(); head_handle->setup_command_buffers(*tail_handle, *queue_context); - queue_context->notify_submit(submit, head_handle, tail_handle); + queue_context->notify_submit(submit, head_handle, tail_handle, now); next_cbs.emplace_back([&]() -> auto { auto cbs = std::make_unique(); diff --git a/src/queue_context.cc b/src/queue_context.cc index 2f0a89d..e210685 100644 --- a/src/queue_context.cc +++ b/src/queue_context.cc @@ 
-5,7 +5,6 @@ #include #include #include -#include #include #include @@ -51,7 +50,8 @@ QueueContext::~QueueContext() { void QueueContext::notify_submit( const VkSubmitInfo& info, const std::shared_ptr head_handle, - const std::shared_ptr tail_handle) { + const std::shared_ptr tail_handle, + const DeviceContext::Clock::time_point_t& now) { auto signals = std::unordered_set{}; auto waits = std::unordered_set{}; @@ -71,7 +71,7 @@ void QueueContext::notify_submit( } this->submissions.emplace_back(std::make_unique( - std::move(signals), std::move(waits), head_handle, tail_handle)); + std::move(signals), std::move(waits), head_handle, tail_handle, now)); // TODO HACK if (std::size(this->submissions) > 100) { @@ -82,7 +82,8 @@ void QueueContext::notify_submit( void QueueContext::notify_submit( const VkSubmitInfo2& info, const std::shared_ptr head_handle, - const std::shared_ptr tail_handle) { + const std::shared_ptr tail_handle, + const DeviceContext::Clock::time_point_t& now) { auto signals = std::unordered_set{}; auto waits = std::unordered_set{}; @@ -108,7 +109,7 @@ void QueueContext::notify_submit( } this->submissions.emplace_back(std::make_unique( - std::move(signals), std::move(waits), head_handle, tail_handle)); + std::move(signals), std::move(waits), head_handle, tail_handle, now)); // TODO HACK if (std::size(this->submissions) > 100) { @@ -162,10 +163,9 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) { return *start_iter; }(); - this->in_flight_frames.emplace_back(Frame{ - .prev_frame_last_submit = prev_frame_last_submit, - .submissions = std::move(this->submissions), - }); + this->in_flight_frames.emplace_back( + Frame{.submissions = std::move(this->submissions), + .cpu_post_present_time = std::chrono::steady_clock::now()}); assert(std::size(this->in_flight_frames.back().submissions)); // *valid but unspecified state after move, so clear!* this->submissions.clear(); @@ -270,14 +270,27 @@ void QueueContext::process_frames() { return gputime + 
(end - start); }); - const auto start = - frame.prev_frame_last_submit->end_handle->get_time_required(); - const auto end = merged.back().end; - const auto not_gputime = (end - start) - gputime; + // Our cpu_start value here refers to the time when the CPU was allowed + // to move past the present call and, in theory, begin cpu work on the + // next frame. + const auto cpu_start = [&]() -> auto { + if (const auto it = std::rbegin(this->timings); + it != std::rend(this->timings)) { + return (*it)->frame.cpu_post_present_time; + } + // This will happen *once*, and only for the first frame. We don't + // have a way of knowing when the CPU first started work obviously + // in this case because we're a vulkan layer and not omniscient. + // Just return our first submit's start for this edge case. + return frame.submissions.front()->start_handle->get_time_required(); + }(); + + const auto cputime = + frame.submissions.front()->enqueued_time - cpu_start; auto timing = Timing{ .gputime = gputime, - .not_gputime = not_gputime, + .cputime = cputime, .frame = frame, }; this->timings.emplace_back(std::make_unique(timing)); @@ -307,13 +320,14 @@ void QueueContext::sleep_in_present() { return; } - // This is doing more than it looks like one line can do (tbf it is a long - // line). It's getting the most recent frame and waiting until its start has + // This is getting the most recent frame and waiting until its start has // begun. This means that, in the case of >1 frame in flight, it's draining // all of them before we're allowed to move forward. 
- const auto a = this->in_flight_frames.back() - .submissions.front() - ->start_handle->get_time_spinlock(); + const auto first_gpu_work = [&]() -> auto { + const auto& most_recent_frame = this->in_flight_frames.back(); + const auto& first_submission = most_recent_frame.submissions.front(); + return first_submission->start_handle->get_time_spinlock(); + }(); // Process frames because as stated above, we might have multiple frames // now completed. @@ -343,20 +357,30 @@ void QueueContext::sleep_in_present() { const auto expected_gputime = calc_median([](const auto& timing) { return timing->gputime; }); - const auto expected_not_gputime = - calc_median([](const auto& timing) { return timing->not_gputime; }); + const auto expected_cputime = + calc_median([](const auto& timing) { return timing->cputime; }); std::cerr << " expected gputime: "; debug_log_time(expected_gputime); - std::cerr << " expected not_gputime: "; - debug_log_time(expected_not_gputime); + std::cerr << " expected cputime: "; + debug_log_time(expected_cputime); + + // Should look like this: + // total_length = expected_gputime + // |------------------------x------------------------------| + // ^ first_gpu_work now last_gpu_work ^ const auto now = std::chrono::steady_clock::now(); - const auto dist = now - a; - const auto expected = expected_gputime - dist; + const auto dist = now - first_gpu_work; + const auto expected_dist_to_last = expected_gputime - dist; + + const auto wait_time = expected_dist_to_last - expected_cputime; + + auto& frame = this->in_flight_frames.back(); + const auto& last_gpu_work = frame.submissions.back()->end_handle; + last_gpu_work->get_time_spinlock(now + wait_time); - const auto& frame = this->in_flight_frames.back(); - frame.submissions.back()->end_handle->get_time_spinlock(now + expected); + frame.cpu_post_present_time = std::chrono::steady_clock::now(); } } // namespace low_latency \ No newline at end of file diff --git a/src/queue_context.hh b/src/queue_context.hh index 
f8782de..0c5e51f 100644 --- a/src/queue_context.hh +++ b/src/queue_context.hh @@ -36,6 +36,8 @@ class QueueContext final : public Context { const std::shared_ptr start_handle; const std::shared_ptr end_handle; + + const DeviceContext::Clock::time_point_t enqueued_time; std::string debug; }; @@ -48,14 +50,17 @@ class QueueContext final : public Context { // present was called. // std::size(submissions) >= 1 btw struct Frame { - submission_ptr_t prev_frame_last_submit; std::deque submissions; + + // the point that control flow was returned from VkQueuePresentKHR back to the + // application. + DeviceContext::Clock::time_point_t cpu_post_present_time; }; std::deque in_flight_frames; // Completed frames. struct Timing { - DeviceContext::Clock::time_point_t::duration gputime, not_gputime; + DeviceContext::Clock::time_point_t::duration gputime, cputime; Frame frame; }; @@ -70,15 +75,15 @@ class QueueContext final : public Context { virtual ~QueueContext(); public: - void - notify_submit(const VkSubmitInfo& info, - const std::shared_ptr head_handle, - const std::shared_ptr tail_handle); - - void - notify_submit(const VkSubmitInfo2& info, - const std::shared_ptr head_handle, - const std::shared_ptr tail_handle); + void notify_submit(const VkSubmitInfo& info, + const std::shared_ptr head_handle, + const std::shared_ptr tail_handle, + const DeviceContext::Clock::time_point_t& now); + + void notify_submit(const VkSubmitInfo2& info, + const std::shared_ptr head_handle, + const std::shared_ptr tail_handle, + const DeviceContext::Clock::time_point_t& now); void notify_present(const VkPresentInfoKHR& info); -- cgit v1.2.3