Fix missing cpu time between gpu presents, simplify timestamp->get_time()

author: Nicolas James <nj3ahxac@gmail.com> 2026-02-18 14:51:21 +1100
committer: Nicolas James <nj3ahxac@gmail.com> 2026-02-18 14:51:21 +1100
commit: bb7c9b56e1710a7f2a3f4bb57f181e4fa196aba2 (patch)
tree: e9de8b88afcb339af9a7c87156511f6cc238f829
parent: b7414a760c183d4987090c110ddacca277983e17 (diff)
4 files changed, 77 insertions, 58 deletions
diff --git a/src/queue_context.cc b/src/queue_context.cc
index 7121898..363149b 100644
--- a/src/queue_context.cc
+++ b/src/queue_context.cc
@@ -152,7 +152,6 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) {
     // completely redundant, in all cases it was exactly what we have here. But
     // I could be wrong.
 
-    // that wa
     const auto start_iter = std::begin(this->submissions);
     // no op submit?
     if (start_iter == std::end(this->submissions)) {
@@ -164,13 +163,31 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) {
     (*start_iter)->debug += "first_during_present ";
     (*last_iter)->debug += "last_during_present ";
 
-    // TODO We probably want to walk through other graphics queues and get
-    // the time that they submitted at some stage. Somewhat difficult to do,
-    // we probably need to create a graph of dependencies but it seems like
-    // overengineering. I will just get this working well before I deal
-    // with complicated edge cases.
+    // The last submission is either in flight, already processed, or we
+    // just happen to be the first frame and we can just set it to our start
+    // with little conseuqence.
+    const auto prev_frame_last_submit = [&]() -> auto {
+        if (const auto iter = std::rbegin(this->in_flight_frames);
+            iter != std::rend(this->in_flight_frames)) {
+
+            assert(!iter->submissions.empty());
+            return iter->submissions.back();
+        }
+
+        if (const auto iter = std::rbegin(this->timings);
+            iter != std::rend(this->timings)) {
+
+            const auto& submissions = (*iter)->frame.submissions;
+            assert(!submissions.empty());
+
+            return submissions.back();
+        }
+
+        return *start_iter;
+    }();
 
     this->in_flight_frames.emplace_back(Frame{
+        .prev_frame_last_submit = prev_frame_last_submit,
         .submissions = std::move(this->submissions),
         .sequence = (*last_iter)->sequence,
     });
@@ -240,9 +257,7 @@ void QueueContext::process_frames() {
                 frame.submissions, std::back_inserter(intervals),
                 [&, this](const auto& submission) {
                     const auto get_time = [&, this](const auto& handle) {
-                        const auto t = handle->get_ticks(*this->timestamp_pool);
-                        assert(t.has_value()); // guaranteed from seq.
-                        return clock.ticks_to_time(*t);
+                        return handle->get_time();
                     };
 
                     return Interval{
@@ -280,10 +295,15 @@ void QueueContext::process_frames() {
                 return gputime + (end - start);
             });
 
-        const auto start = merged.front().start;
+        // The start should be the previous frame's last submission, NOT when we
+        // start here. Otherwise, we won't account for the time between the
+        // previous frame's last submission and our first submission, which
+        // could genuinely be a large period of time. So look for it in timings,
+        // because it's guaranteed to be there at this stage. Either that, or
+        // we're the first frame, and it doesn't really matter.
+        const auto start = frame.prev_frame_last_submit->end_handle->get_time();
         const auto end = merged.back().end;
-        const auto not_gputime =
-            (merged.back().end - merged.front().start) - gputime;
+        const auto not_gputime = (end - start) - gputime;
 
         auto timing = Timing{
             .gputime = gputime,
@@ -304,7 +324,6 @@ void QueueContext::process_frames() {
     }
 }
 
-using opt_time_point_t = std::optional<DeviceContext::Clock::time_point_t>;
 void QueueContext::sleep_in_present() {
     const auto& device = this->device_context;
     const auto& vtable = device.vtable;
@@ -350,27 +369,20 @@ void QueueContext::sleep_in_present() {
         return;
     }
 
-    const auto expected_gputime = [&, this]() {
+    const auto calc_median = [&, this](const auto& getter) {
         auto vect = std::vector<Timing*>{};
         std::ranges::transform(this->timings, std::back_inserter(vect),
                                [](const auto& timing) { return timing.get(); });
-        std::ranges::sort(vect, [](const auto& a, const auto& b) {
-            return a->gputime < b->gputime;
+        std::ranges::sort(vect, [&](const auto& a, const auto& b) {
+            return getter(a) < getter(b);
         });
-        // return vect[0]->frametime;
-        return vect[std::size(vect) / 2]->gputime;
-    }();
+        return getter(vect[std::size(vect) / 2]);
+    };
 
-    const auto expected_not_gputime = [&, this]() {
-        auto vect = std::vector<Timing*>{};
-        std::ranges::transform(this->timings, std::back_inserter(vect),
-                               [](const auto& timing) { return timing.get(); });
-        std::ranges::sort(vect, [](const auto& a, const auto& b) {
-            return a->not_gputime < b->not_gputime;
-        });
-        // return vect[0]->frametime;
-        return vect[std::size(vect) / 2]->not_gputime;
-    }();
+    const auto expected_gputime =
+        calc_median([](const auto& timing) { return timing->gputime; });
+    const auto expected_not_gputime =
+        calc_median([](const auto& timing) { return timing->not_gputime; });
 
     std::cerr << "    expected gputime: ";
     debug_log_time(expected_gputime);
@@ -413,13 +425,7 @@ void QueueContext::sleep_in_present() {
 
     // We now know that A is available because its semaphore has been
     // signalled.
-
-    const auto a = [&, this]() {
-        const auto& handle = frame.submissions.front()->start_handle;
-        const auto ticks = handle->get_ticks(*this->timestamp_pool);
-        assert(ticks.has_value());
-        return device_context.clock.ticks_to_time(*ticks);
-    }();
+    const auto a = frame.prev_frame_last_submit->end_handle->get_time();
 
     const auto now = std::chrono::steady_clock::now();
     const auto dist = now - a;
diff --git a/src/queue_context.hh b/src/queue_context.hh
index ed13465..219e6fb 100644
--- a/src/queue_context.hh
+++ b/src/queue_context.hh
@@ -48,10 +48,12 @@ class QueueContext final : public Context {
     std::deque<submission_ptr_t> submissions;
 
     // In flight frame submissions grouped together.
-    // The first element in the vector refers to the first submission that
+    // The first element in the deque refers to the first submission that
     // contributed to that frame. The last element is the last submission before
     // present was called.
+    // std::size(submissions) >= 1 btw
     struct Frame {
+        submission_ptr_t prev_frame_last_submit;
         std::deque<submission_ptr_t> submissions;
         std::uint64_t sequence;
     };
diff --git a/src/timestamp_pool.cc b/src/timestamp_pool.cc
index 01afe07..854fae1 100644
--- a/src/timestamp_pool.cc
+++ b/src/timestamp_pool.cc
@@ -76,18 +76,19 @@ std::shared_ptr<TimestampPool::Handle> TimestampPool::acquire() {
     const auto query_index = *std::begin(indices);
     assert(indices.erase(query_index));
 
-    return std::make_shared<Handle>(*not_empty_iter, query_index);
+    return std::make_shared<Handle>(*this, *not_empty_iter, query_index);
 }
 
-TimestampPool::Handle::Handle(const std::shared_ptr<QueryChunk>& origin_chunk,
+TimestampPool::Handle::Handle(const TimestampPool& timestamp_pool,
+                              const std::shared_ptr<QueryChunk>& origin_chunk,
                               const std::uint64_t& query_index)
-    : query_pool(origin_chunk->query_pool), query_index(query_index),
-      origin_chunk(origin_chunk),
+    : timestamp_pool(timestamp_pool), query_pool(origin_chunk->query_pool),
+      query_index(query_index), origin_chunk(origin_chunk),
       command_buffer((*origin_chunk->command_buffers)[query_index]) {}
 
 TimestampPool::Handle::~Handle() {
-    // Parent destructing shouldn't mean we should have a bunch of insertions
-    // for zero reason.
+    // Parent destructing shouldn't mean we should have a bunch of
+    // insertions for zero reason.
     if (const auto ptr = this->origin_chunk.lock(); ptr) {
         assert(ptr->free_indices->insert(this->query_index).second);
     }
@@ -122,29 +123,35 @@ void TimestampPool::Handle::setup_command_buffers(
     vtable.EndCommandBuffer(tail.command_buffer);
 }
 
-std::optional<std::uint64_t>
-TimestampPool::Handle::get_ticks(const TimestampPool& pool) {
-
-    const auto& device_context = pool.queue_context.device_context;
-    const auto& vtable = device_context.vtable;
+DeviceContext::Clock::time_point_t TimestampPool::Handle::get_time() {
+    const auto& device_ctx = this->timestamp_pool.queue_context.device_context;
+    const auto& vtable = device_ctx.vtable;
 
+    // For debug builds, we're going to query the availability bit so we can
+    // assert that after the semaphore has flagged it as naturally available.
     struct QueryResult {
         std::uint64_t value;
+#ifndef NDEBUG
         std::uint64_t available;
+#endif
     };
     auto query_result = QueryResult{};
 
+    constexpr auto query_flags = []() -> auto {
+        auto flag = VkQueryResultFlags{VK_QUERY_RESULT_64_BIT};
+#ifndef NDEBUG
+        flag |= VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
+#endif
+        return flag;
+    }();
+
     const auto r = vtable.GetQueryPoolResults(
-        device_context.device, query_pool, this->query_index, 1,
-        sizeof(query_result), &query_result, sizeof(query_result),
-        VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT);
+        device_ctx.device, query_pool, this->query_index, 1,
+        sizeof(query_result), &query_result, sizeof(query_result), query_flags);
 
-    assert(r == VK_SUCCESS || r == VK_NOT_READY);
+    assert(r == VK_SUCCESS && query_result.available);
 
-    if (!query_result.available) {
-        return std::nullopt;
-    }
-    return query_result.value;
+    return device_ctx.clock.ticks_to_time(query_result.value);
 }
 
 TimestampPool::~TimestampPool() {
diff --git a/src/timestamp_pool.hh b/src/timestamp_pool.hh
index 681c9e2..b7aa54e 100644
--- a/src/timestamp_pool.hh
+++ b/src/timestamp_pool.hh
@@ -18,6 +18,8 @@
 #include <unordered_set>
 #include <vector>
 
+#include "device_context.hh"
+
 namespace low_latency {
 
 class QueueContext;
@@ -58,6 +60,7 @@ class TimestampPool final {
         friend class TimestampPool;
 
       private:
+        const TimestampPool& timestamp_pool;
         const std::weak_ptr<QueryChunk> origin_chunk;
 
       public:
@@ -66,7 +69,8 @@ class TimestampPool final {
         const VkCommandBuffer command_buffer;
 
       public:
-        Handle(const std::shared_ptr<QueryChunk>& origin_chunk,
+        Handle(const TimestampPool& timestamp_pool,
+               const std::shared_ptr<QueryChunk>& origin_chunk,
                const std::uint64_t& query_index);
         Handle(const Handle& handle) = delete;
         Handle(Handle&&) = delete;
@@ -78,7 +82,7 @@ class TimestampPool final {
         void setup_command_buffers(const Handle& tail,
                                    const QueueContext& queue_context) const;
 
-        std::optional<std::uint64_t> get_ticks(const TimestampPool& pool);
+        DeviceContext::Clock::time_point_t get_time();
     };
 
   public:
author	Nicolas James <nj3ahxac@gmail.com>	2026-02-18 14:51:21 +1100
committer	Nicolas James <nj3ahxac@gmail.com>	2026-02-18 14:51:21 +1100
commit	bb7c9b56e1710a7f2a3f4bb57f181e4fa196aba2 (patch)
tree	e9de8b88afcb339af9a7c87156511f6cc238f829
parent	b7414a760c183d4987090c110ddacca277983e17 (diff)