From 3a5299c81884e8b28fa6a1a57f31c3375a4b633d Mon Sep 17 00:00:00 2001
From: Nicolas James <nj3ahxac@gmail.com>
Date: Thu, 19 Feb 2026 13:16:14 +1100
Subject: Don't mess with timeline semaphores in submit, spin on
 vkGetQueryPoolResults instead, fix start = end submission issue

---
 src/layer.cc          | 158 ++++++++++++++++++++------------------------------
 src/queue_context.cc  | 152 ++++++++++--------------------------------------
 src/queue_context.hh  |  10 +---
 src/timestamp_pool.cc |  63 +++++++++++++++-----
 src/timestamp_pool.hh |  20 ++++++-
 5 files changed, 164 insertions(+), 239 deletions(-)

(limited to 'src')
diff --git a/src/layer.cc b/src/layer.cc
index 160851f..c19fbfc 100644
--- a/src/layer.cc
+++ b/src/layer.cc
@@ -282,8 +282,6 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
         DEVICE_VTABLE_LOAD(DestroyDevice),
         DEVICE_VTABLE_LOAD(GetDeviceQueue),
         DEVICE_VTABLE_LOAD(QueueSubmit),
-        DEVICE_VTABLE_LOAD(CreateSemaphore),
-        DEVICE_VTABLE_LOAD(DestroySemaphore),
         DEVICE_VTABLE_LOAD(CreateQueryPool),
         DEVICE_VTABLE_LOAD(DestroyQueryPool),
         DEVICE_VTABLE_LOAD(GetQueryPoolResults),
@@ -302,8 +300,6 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
         DEVICE_VTABLE_LOAD(AcquireNextImageKHR),
         DEVICE_VTABLE_LOAD(QueuePresentKHR),
         DEVICE_VTABLE_LOAD(AcquireNextImage2KHR),
-        DEVICE_VTABLE_LOAD(GetSemaphoreCounterValueKHR),
-        DEVICE_VTABLE_LOAD(WaitSemaphoresKHR),
         DEVICE_VTABLE_LOAD(CmdWriteTimestamp2KHR),
         DEVICE_VTABLE_LOAD(QueueSubmit2KHR),
         DEVICE_VTABLE_LOAD(GetCalibratedTimestampsKHR),
@@ -453,55 +449,44 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count,
         return vtable.QueueSubmit(queue, submit_count, submit_infos, fence);
     }
 
-    // We have to avoid casting away the const* of the passed VkSubmitInfos.
-    // So we end up copying a lot of stuff and wrapping them in unique_ptrs
-    // so their position in memory is stable.
-
-    using cb_vect = std::vector<VkCommandBuffer>;
-    using tssi_t = VkTimelineSemaphoreSubmitInfo;
+    using cbs_t = std::vector<VkCommandBuffer>;
     auto next_submits = std::vector<VkSubmitInfo>{};
-    auto next_cbs = std::vector<std::unique_ptr<cb_vect>>{};
-    auto handles = std::vector<std::shared_ptr<TimestampPool::Handle>>{};
-    auto tssis = std::vector<std::unique_ptr<tssi_t>>{};
 
-    for (const auto& submit_info : std::span{submit_infos, submit_count}) {
-        const auto head_handle = queue_context->timestamp_pool->acquire();
-        const auto tail_handle = queue_context->timestamp_pool->acquire();
+    // We're making modifications to multiple vkQueueSubmits. These have raw
+    // pointers to our command buffer arrays, whose position in memory can
+    // change on vector reallocation. So we use unique_ptrs here.
+    auto next_cbs = std::vector<std::unique_ptr<cbs_t>>{};
+
+    // notify_submit() should take copies of these shared_ptrs and store
+    // them for the duration of our call, but saving them here is a bit
+    // more explicit + insurance if that changes.
+    auto handles = std::vector<std::shared_ptr<TimestampPool::Handle>>{};
 
-        next_cbs.emplace_back([&]() -> auto {
-            auto cbs = std::make_unique<std::vector<VkCommandBuffer>>();
+    std::ranges::transform(
+        std::span{submit_infos, submit_count}, std::back_inserter(next_submits),
+        [&](const auto& submit) {
+            const auto head_handle = queue_context->timestamp_pool->acquire();
+            const auto tail_handle = queue_context->timestamp_pool->acquire();
             head_handle->setup_command_buffers(*tail_handle, *queue_context);
-            cbs->push_back(head_handle->command_buffer);
-            std::ranges::copy_n(submit_info.pCommandBuffers,
-                                submit_info.commandBufferCount,
-                                std::back_inserter(*cbs));
-            cbs->push_back(tail_handle->command_buffer);
-            return cbs;
-        }());
-        next_submits.push_back(submit_info);
-        next_submits.back().pCommandBuffers = std::data(*next_cbs.back());
-        next_submits.back().commandBufferCount = std::size(*next_cbs.back());
-        handles.push_back(head_handle);
-        handles.push_back(tail_handle);
-
-        // We submit an extra command which signals a timeline semaphore which
-        // signals that this command has completed.
-        const auto sequence = 1 + queue_context->semaphore_sequence++;
-        queue_context->notify_submit(submit_info, sequence, head_handle,
-                                     tail_handle);
-
-        tssis.push_back(std::make_unique<tssi_t>(tssi_t{
-            .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR,
-            .signalSemaphoreValueCount = 1,
-            .pSignalSemaphoreValues = &sequence,
-        }));
-        next_submits.push_back(VkSubmitInfo{
-            .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
-            .pNext = tssis.back().get(),
-            .signalSemaphoreCount = 1,
-            .pSignalSemaphores = &queue_context->semaphore,
+            queue_context->notify_submit(submit, head_handle, tail_handle);
+
+            handles.emplace_back(head_handle);
+            handles.emplace_back(tail_handle);
+            next_cbs.emplace_back([&]() -> auto {
+                auto cbs = std::make_unique<cbs_t>();
+                cbs->push_back(head_handle->command_buffer);
+                std::ranges::copy_n(submit.pCommandBuffers,
+                                    submit.commandBufferCount,
+                                    std::back_inserter(*cbs));
+                cbs->push_back(tail_handle->command_buffer);
+                return cbs;
+            }());
+
+            auto next_submit = submit;
+            next_submit.pCommandBuffers = std::data(*next_cbs.back());
+            next_submit.commandBufferCount = std::size(*next_cbs.back());
+            return next_submit;
         });
-    }
 
     return vtable.QueueSubmit(queue, std::size(next_submits),
                               std::data(next_submits), fence);
@@ -519,57 +504,42 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count,
         return vtable.QueueSubmit2(queue, submit_count, submit_infos, fence);
     }
 
-    using cb_vect_t = std::vector<VkCommandBufferSubmitInfo>;
+    using cbs_t = std::vector<VkCommandBufferSubmitInfo>;
     auto next_submits = std::vector<VkSubmitInfo2>{};
-    auto next_cbs = std::vector<std::unique_ptr<cb_vect_t>>{};
+    auto next_cbs = std::vector<std::unique_ptr<cbs_t>>{};
     auto handles = std::vector<std::shared_ptr<TimestampPool::Handle>>{};
-    auto next_ssis = std::vector<std::unique_ptr<VkSemaphoreSubmitInfo>>{};
-
-    for (const auto& submit_info : std::span{submit_infos, submit_count}) {
-        const auto head_handle = queue_context->timestamp_pool->acquire();
-        const auto tail_handle = queue_context->timestamp_pool->acquire();
 
-        next_cbs.emplace_back([&]() -> auto {
-            auto cbs = std::make_unique<cb_vect_t>();
+    std::ranges::transform(
+        std::span{submit_infos, submit_count}, std::back_inserter(next_submits),
+        [&](const auto& submit) {
+            const auto head_handle = queue_context->timestamp_pool->acquire();
+            const auto tail_handle = queue_context->timestamp_pool->acquire();
             head_handle->setup_command_buffers(*tail_handle, *queue_context);
-            cbs->push_back(VkCommandBufferSubmitInfo{
-                .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
-                .commandBuffer = head_handle->command_buffer,
-            });
-            std::ranges::copy_n(submit_info.pCommandBufferInfos,
-                                submit_info.commandBufferInfoCount,
-                                std::back_inserter(*cbs));
-            cbs->push_back(VkCommandBufferSubmitInfo{
-                .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
-                .commandBuffer = tail_handle->command_buffer,
-            });
-            return cbs;
-        }());
-
-        next_submits.push_back(submit_info);
-        next_submits.back().pCommandBufferInfos = std::data(*next_cbs.back());
-        next_submits.back().commandBufferInfoCount =
-            std::size(*next_cbs.back());
-        handles.push_back(head_handle);
-        handles.push_back(tail_handle);
-
-        const auto sequence = 1 + queue_context->semaphore_sequence++;
-        queue_context->notify_submit(submit_info, sequence, head_handle,
-                                     tail_handle);
-
-        next_ssis.push_back(
-            std::make_unique<VkSemaphoreSubmitInfo>(VkSemaphoreSubmitInfo{
-                .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
-                .semaphore = queue_context->semaphore,
-                .value = sequence,
-                .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
-            }));
-        next_submits.push_back(VkSubmitInfo2{
-            .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
-            .signalSemaphoreInfoCount = 1,
-            .pSignalSemaphoreInfos = next_ssis.back().get(),
+            queue_context->notify_submit(submit, head_handle, tail_handle);
+            // Retain the handles for this call, as vkQueueSubmit does above.
+            handles.emplace_back(head_handle);
+            handles.emplace_back(tail_handle);
+            next_cbs.emplace_back([&]() -> auto {
+                auto cbs = std::make_unique<cbs_t>();
+                cbs->push_back(VkCommandBufferSubmitInfo{
+                    .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
+                    .commandBuffer = head_handle->command_buffer,
+                });
+                std::ranges::copy_n(submit.pCommandBufferInfos,
+                                    submit.commandBufferInfoCount,
+                                    std::back_inserter(*cbs));
+                cbs->push_back(VkCommandBufferSubmitInfo{
+                    .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
+                    .commandBuffer = tail_handle->command_buffer,
+                });
+                return cbs;
+            }());
+
+            auto next_submit = submit;
+            next_submit.pCommandBufferInfos = std::data(*next_cbs.back());
+            next_submit.commandBufferInfoCount = std::size(*next_cbs.back());
+            return next_submit;
         });
-    }
 
     return vtable.QueueSubmit2(queue, std::size(next_submits),
                                std::data(next_submits), fence);
diff --git a/src/queue_context.cc b/src/queue_context.cc
index 3914691..2f0a89d 100644
--- a/src/queue_context.cc
+++ b/src/queue_context.cc
@@ -28,25 +28,6 @@ make_command_pool(const DeviceContext& device_context,
     return command_pool;
 }
 
-static VkSemaphore make_semaphore(const DeviceContext& device_context) {
-
-    const auto stci = VkSemaphoreTypeCreateInfo{
-        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
-        .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE,
-        .initialValue = 0,
-    };
-
-    const auto sci = VkSemaphoreCreateInfo{
-        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
-        .pNext = &stci,
-    };
-
-    auto semaphore = VkSemaphore{};
-    device_context.vtable.CreateSemaphore(device_context.device, &sci, nullptr,
-                                          &semaphore);
-    return semaphore;
-}
-
 QueueContext::QueueContext(DeviceContext& device_context, const VkQueue& queue,
                            const std::uint32_t& queue_family_index)
     : device_context(device_context), queue(queue),
@@ -54,7 +35,6 @@ QueueContext::QueueContext(DeviceContext& device_context, const VkQueue& queue,
       // Important we make the command pool before the timestamp pool, because
       // it's a dependency.
       command_pool(make_command_pool(device_context, queue_family_index)),
-      semaphore(make_semaphore(device_context)),
       timestamp_pool(std::make_unique<TimestampPool>(*this)) {}
 
 QueueContext::~QueueContext() {
@@ -64,14 +44,12 @@ QueueContext::~QueueContext() {
     this->timestamp_pool.reset();
 
     const auto& vtable = this->device_context.vtable;
-    vtable.DestroySemaphore(this->device_context.device, this->semaphore,
-                            nullptr);
     vtable.DestroyCommandPool(this->device_context.device, this->command_pool,
                               nullptr);
 }
 
 void QueueContext::notify_submit(
-    const VkSubmitInfo& info, const std::uint64_t& sequence,
+    const VkSubmitInfo& info,
     const std::shared_ptr<TimestampPool::Handle> head_handle,
     const std::shared_ptr<TimestampPool::Handle> tail_handle) {
 
@@ -92,9 +70,8 @@ void QueueContext::notify_submit(
         std::cerr << "      " << wait << '\n';
     }
 
-    this->submissions.emplace_back(
-        std::make_unique<Submission>(std::move(signals), std::move(waits),
-                                     head_handle, tail_handle, sequence));
+    this->submissions.emplace_back(std::make_unique<Submission>(
+        std::move(signals), std::move(waits), head_handle, tail_handle));
 
     // TODO HACK
     if (std::size(this->submissions) > 100) {
@@ -103,7 +80,7 @@ void QueueContext::notify_submit(
 }
 
 void QueueContext::notify_submit(
-    const VkSubmitInfo2& info, const std::uint64_t& sequence,
+    const VkSubmitInfo2& info,
     const std::shared_ptr<TimestampPool::Handle> head_handle,
     const std::shared_ptr<TimestampPool::Handle> tail_handle) {
 
@@ -130,9 +107,8 @@ void QueueContext::notify_submit(
         std::cerr << "      " << wait << '\n';
     }
 
-    this->submissions.emplace_back(
-        std::make_unique<Submission>(std::move(signals), std::move(waits),
-                                     head_handle, tail_handle, sequence));
+    this->submissions.emplace_back(std::make_unique<Submission>(
+        std::move(signals), std::move(waits), head_handle, tail_handle));
 
     // TODO HACK
     if (std::size(this->submissions) > 100) {
@@ -165,7 +141,7 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) {
 
     // The last submission is either in flight, already processed, or we
     // just happen to be the first frame and we can just set it to our start
-    // with little conseuqence.
+    // with little consequence.
     const auto prev_frame_last_submit = [&]() -> auto {
         if (const auto iter = std::rbegin(this->in_flight_frames);
             iter != std::rend(this->in_flight_frames)) {
@@ -189,7 +165,6 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) {
     this->in_flight_frames.emplace_back(Frame{
         .prev_frame_last_submit = prev_frame_last_submit,
         .submissions = std::move(this->submissions),
-        .sequence = (*last_iter)->sequence,
     });
     assert(std::size(this->in_flight_frames.back().submissions));
     // *valid but unspecified state after move, so clear!*
@@ -211,22 +186,12 @@ void QueueContext::process_frames() {
 
     // We used to collect all devices that were pointed to by all potential
     // submissions, put them in a set and then call.calibrate() on each once.
-    // This is unnecessary now - we can assume all submissions come from the
-    // same queue (this one!).
+    // This is unnecessary now - we assume all submissions come from the same
+    // queue. FIXME: don't assume this.
     auto& device_context = this->device_context;
     auto& clock = device_context.clock;
     clock.calibrate();
 
-    // Get the queue's sequence number so we can quickly check
-    // frames are finished without calling getCalibratedTimestamps.
-    // This is somewhat a premature optimization but it's elegant.
-    const auto seq = [&, this]() -> auto {
-        auto seq = std::uint64_t{0};
-        device_context.vtable.GetSemaphoreCounterValueKHR(
-            device_context.device, this->semaphore, &seq);
-        return seq;
-    }();
-
     while (std::size(this->in_flight_frames)) {
         const auto& frame = this->in_flight_frames.front();
 
@@ -237,7 +202,7 @@ void QueueContext::process_frames() {
         const auto& last_submission = frame.submissions.back();
 
         // Not completed (so future frames definitely aren't) - stop early.
-        if (seq < last_submission->sequence) {
+        if (!last_submission->end_handle->get_time().has_value()) {
             break;
         }
 
@@ -256,13 +221,9 @@ void QueueContext::process_frames() {
             std::ranges::transform(
                 frame.submissions, std::back_inserter(intervals),
                 [&, this](const auto& submission) {
-                    const auto get_time = [&, this](const auto& handle) {
-                        return handle->get_time();
-                    };
-
                     return Interval{
-                        .start = get_time(submission->start_handle),
-                        .end = get_time(submission->end_handle),
+                        .start = submission->start_handle->get_time_required(),
+                        .end = submission->end_handle->get_time_required(),
                     };
                 });
 
@@ -309,7 +270,8 @@ void QueueContext::process_frames() {
                 return gputime + (end - start);
             });
 
-        const auto start = frame.prev_frame_last_submit->end_handle->get_time();
+        const auto start =
+            frame.prev_frame_last_submit->end_handle->get_time_required();
         const auto end = merged.back().end;
         const auto not_gputime = (end - start) - gputime;
 
@@ -341,32 +303,24 @@ void QueueContext::sleep_in_present() {
     // frames*.
     this->process_frames();
 
-    if (const auto F = std::size(this->in_flight_frames); F > 1) {
-        // In this case, we are so far ahead that there are multiple frames
-        // in flight. Either that, or our bookkeeping has gone horribly
-        // wrong! Wait on the 2nd last frame in flight to complete. This
-        // shunts us to F=1.
-        const auto second_iter = std::next(std::rbegin(this->in_flight_frames));
-        assert(second_iter != std::rend(this->in_flight_frames));
-
-        const auto swi = VkSemaphoreWaitInfo{
-            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
-            .semaphoreCount = 1,
-            .pSemaphores = &this->semaphore,
-            .pValues = &second_iter->sequence,
-        };
-        vtable.WaitSemaphoresKHR(device.device, &swi,
-                                 std::numeric_limits<std::uint64_t>::max());
-
-        // Here
-        this->process_frames(); // get rid of completed frames
-    } else if (!F) {
-        // We have completed all frames. DO NOT WAIT!
+    if (!std::size(this->in_flight_frames)) {
         return;
     }
 
-    // We are checking size again because process_frames might have drained
-    // it to zero.
+    // This is doing more than it looks like one line can do (tbf it is a long
+    // line). It takes the most recent frame and waits until the start timestamp
+    // of its first submission is available. So with >1 frame in flight, it's
+    // draining all of them before we're allowed to move forward.
+    const auto a = this->in_flight_frames.back()
+                       .submissions.front()
+                       ->start_handle->get_time_spinlock();
+
+    // Process frames because as stated above, we might have multiple frames
+    // now completed.
+    this->process_frames();
+
+    // Check the size again because the frame we want to target may have already
+    // completed when we called process_frames().
     if (!std::size(this->in_flight_frames)) {
         return;
     }
@@ -397,56 +351,12 @@ void QueueContext::sleep_in_present() {
     std::cerr << "    expected not_gputime: ";
     debug_log_time(expected_not_gputime);
 
-    //                               PRESENT CALL
-    // |----------------------------------|----------------|
-    // first                              b                c
-    //
-    // Us, the CPU on the host, is approximately at 'b'. We have a good
-    // guess for the distance between a and b as gputime.
-
-    const auto& frame = this->in_flight_frames.back();
-
-    // We could be in the period where A hasn't signalled yet.
-    // It's impossible to make a decision until we know a.
-    // Doing this is fine because it won't affect throughput at all.
-    // (ie, there's more work queued after regardless).
-    // FIXME: If a == b, then we're waiting for the entire queue
-    // to finish because the semaphore only says if it has finished.
-    // The fix is to check the start timestamp instead of the query
-    // in the case that it's...
-    // Honestly it might be better to signal two semaphores because
-    // we need to wait for when the submission starts work and
-    // right now, we only signal when the submission finishes work.
-    // Ideally we have both, so we can elegantly wait on the start
-    // semaphore of A, then get A's start timestamp. This is BROKEN.
-
-    [&]() -> void {
-        const auto swi = VkSemaphoreWaitInfo{
-            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
-            .semaphoreCount = 1,
-            .pSemaphores = &this->semaphore,
-            .pValues = &frame.submissions.front()->sequence,
-        };
-        vtable.WaitSemaphoresKHR(device.device, &swi,
-                                 std::numeric_limits<std::uint64_t>::max());
-    }();
-
-    // We now know that A is available because its semaphore has been
-    // signalled.
-    const auto a = frame.submissions.front()->start_handle->get_time();
-
     const auto now = std::chrono::steady_clock::now();
     const auto dist = now - a;
     const auto expected = expected_gputime - dist;
 
-    const auto swi = VkSemaphoreWaitInfo{
-        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
-        .semaphoreCount = 1,
-        .pSemaphores = &this->semaphore,
-        .pValues = &frame.sequence,
-    };
-    vtable.WaitSemaphoresKHR(device.device, &swi,
-                             std::max(expected.count(), 0l));
+    const auto& frame = this->in_flight_frames.back();
+    frame.submissions.back()->end_handle->get_time_spinlock(now + expected);
 }
 
 } // namespace low_latency
\ No newline at end of file
diff --git a/src/queue_context.hh b/src/queue_context.hh
index 219e6fb..f8782de 100644
--- a/src/queue_context.hh
+++ b/src/queue_context.hh
@@ -22,9 +22,6 @@ class QueueContext final : public Context {
     const VkQueue queue;
     const std::uint32_t queue_family_index;
 
-    std::uint64_t semaphore_sequence = 0;
-    VkSemaphore semaphore;
-
     VkCommandPool command_pool;
 
     std::unique_ptr<TimestampPool> timestamp_pool;
@@ -40,8 +37,6 @@ class QueueContext final : public Context {
         const std::shared_ptr<TimestampPool::Handle> start_handle;
         const std::shared_ptr<TimestampPool::Handle> end_handle;
 
-        std::uint64_t sequence;
-
         std::string debug;
     };
     using submission_ptr_t = std::shared_ptr<Submission>;
@@ -55,7 +50,6 @@ class QueueContext final : public Context {
     struct Frame {
         submission_ptr_t prev_frame_last_submit;
         std::deque<submission_ptr_t> submissions;
-        std::uint64_t sequence;
     };
     std::deque<Frame> in_flight_frames;
 
@@ -77,12 +71,12 @@ class QueueContext final : public Context {
 
   public:
     void
-    notify_submit(const VkSubmitInfo& info, const std::uint64_t& sequence,
+    notify_submit(const VkSubmitInfo& info,
                   const std::shared_ptr<TimestampPool::Handle> head_handle,
                   const std::shared_ptr<TimestampPool::Handle> tail_handle);
 
     void
-    notify_submit(const VkSubmitInfo2& info, const std::uint64_t& sequence,
+    notify_submit(const VkSubmitInfo2& info,
                   const std::shared_ptr<TimestampPool::Handle> head_handle,
                   const std::shared_ptr<TimestampPool::Handle> tail_handle);
 
diff --git a/src/timestamp_pool.cc b/src/timestamp_pool.cc
index 854fae1..a66bb2a 100644
--- a/src/timestamp_pool.cc
+++ b/src/timestamp_pool.cc
@@ -2,7 +2,9 @@
 #include "device_context.hh"
 #include "queue_context.hh"
 
+#include <chrono>
 #include <ranges>
+#include <thread>
 #include <vulkan/utility/vk_dispatch_table.h>
 #include <vulkan/vulkan_core.h>
 
@@ -123,37 +125,68 @@ void TimestampPool::Handle::setup_command_buffers(
     vtable.EndCommandBuffer(tail.command_buffer);
 }
 
-DeviceContext::Clock::time_point_t TimestampPool::Handle::get_time() {
+std::optional<DeviceContext::Clock::time_point_t>
+TimestampPool::Handle::get_time() {
     const auto& device_ctx = this->timestamp_pool.queue_context.device_context;
     const auto& vtable = device_ctx.vtable;
 
-    // For debug builds, we're going to query the availability bit so we can
-    // assert that after the semaphore has flagged it as naturally available.
     struct QueryResult {
         std::uint64_t value;
-#ifndef NDEBUG
         std::uint64_t available;
-#endif
     };
     auto query_result = QueryResult{};
 
-    constexpr auto query_flags = []() -> auto {
-        auto flag = VkQueryResultFlags{VK_QUERY_RESULT_64_BIT};
-#ifndef NDEBUG
-        flag |= VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
-#endif
-        return flag;
-    }();
-
     const auto r = vtable.GetQueryPoolResults(
         device_ctx.device, query_pool, this->query_index, 1,
-        sizeof(query_result), &query_result, sizeof(query_result), query_flags);
+        sizeof(query_result), &query_result, sizeof(query_result),
+        VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT);
 
-    assert(r == VK_SUCCESS && query_result.available);
+    assert(r == VK_SUCCESS || r == VK_NOT_READY);
+
+    if (!query_result.available) {
+        return std::nullopt;
+    }
 
     return device_ctx.clock.ticks_to_time(query_result.value);
 }
 
+std::optional<DeviceContext::Clock::time_point_t>
+TimestampPool::Handle::get_time_spinlock(
+    const DeviceContext::Clock::time_point_t& until) {
+    
+    auto time = this->get_time();
+    if (time.has_value()) { // fast path, avoid now().
+        return time;
+    }
+    
+    auto last = std::chrono::steady_clock::now();
+    for (; !time.has_value(); time = this->get_time()) {
+        
+        if (const auto now = std::chrono::steady_clock::now(); now >= until) {
+            break;
+        }
+        
+        // Afaik sleep_until() is a no-op for a past deadline, which is ideal.
+        std::this_thread::sleep_until(std::min(last + this->SPINLOCK_MAX_DELAY, until));
+
+        last = std::chrono::steady_clock::now();
+    }
+            
+    return time;
+}
+
+DeviceContext::Clock::time_point_t TimestampPool::Handle::get_time_spinlock() {
+    const auto time = this->get_time_spinlock(DeviceContext::Clock::time_point_t::max());
+    assert(time.has_value());
+    return *time;
+}
+
+DeviceContext::Clock::time_point_t TimestampPool::Handle::get_time_required() {
+    const auto time = this->get_time();
+    assert(time.has_value());
+    return *time;
+}
+
 TimestampPool::~TimestampPool() {
     const auto& device = this->queue_context.device_context.device;
     const auto& vtable = this->queue_context.device_context.vtable;
diff --git a/src/timestamp_pool.hh b/src/timestamp_pool.hh
index b7aa54e..bfdad2e 100644
--- a/src/timestamp_pool.hh
+++ b/src/timestamp_pool.hh
@@ -59,6 +59,11 @@ class TimestampPool final {
       private:
         friend class TimestampPool;
 
+      private:
+        // For our spinlock functions this is the period in which we sleep
+        // between attempts.
+        static constexpr auto SPINLOCK_MAX_DELAY = std::chrono::microseconds(1);
+
       private:
         const TimestampPool& timestamp_pool;
         const std::weak_ptr<QueryChunk> origin_chunk;
@@ -82,7 +87,20 @@ class TimestampPool final {
         void setup_command_buffers(const Handle& tail,
                                    const QueueContext& queue_context) const;
 
-        DeviceContext::Clock::time_point_t get_time();
+        // Attempts to get the time, but returns std::nullopt if it's not
+        // available yet.
+        std::optional<DeviceContext::Clock::time_point_t> get_time();
+
+        // Calls get_time() repeatedly under a spinlock, or gives up once
+        // `until` has passed and returns std::nullopt.
+        std::optional<DeviceContext::Clock::time_point_t>
+        get_time_spinlock(const DeviceContext::Clock::time_point_t& until);
+
+        // Calls get_time() repeatedly under a spinlock until it's available.
+        DeviceContext::Clock::time_point_t get_time_spinlock();
+
+        // Calls get_time with the assumption it's already available.
+        DeviceContext::Clock::time_point_t get_time_required();
     };
 
   public:
-- 
cgit v1.2.3