From 79c9f28f804b591a2981f9d7c224747db906b1e8 Mon Sep 17 00:00:00 2001 From: Nicolas James Date: Thu, 19 Feb 2026 17:02:40 +1100 Subject: WIP timing improvements --- src/layer.cc | 34 +++++++++++++++++++++-- src/queue_context.cc | 78 ++++++++++++++++++++++++++++++++++------------------ src/queue_context.hh | 27 ++++++++++-------- 3 files changed, 99 insertions(+), 40 deletions(-) diff --git a/src/layer.cc b/src/layer.cc index c19fbfc..c49e6a0 100644 --- a/src/layer.cc +++ b/src/layer.cc @@ -449,6 +449,32 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count, return vtable.QueueSubmit(queue, submit_count, submit_infos, fence); } + // What's happening here? + // We are making a very modest modification to all vkQueueSubmits where we + // inject a start and end timestamp query command buffer that writes when + // the GPU started and finished work for each submission. It's important to + // note that we deliberately do *NOT* use or modify any semaphores + // as a mechanism to signal completion or the availability of these submits + // for multiple reasons: + // 1. Modifying semaphores (particularly in vkQueueSubmit1) is ANNOYING + // to do correctly. The pNext chain is const and difficult to modify + // without traversing the entire thing and doing surgical deep copies + // and patches for multiple pNext's sType's. It's easier to leave it + // alone. + // 2. Semaphores only signal at the end of their work, so we cannot use + // them as a mechanism to know if work has started without doing + // another dummy submission. This adds complexity and also skews our + // timestamps slightly. + // 3. Semaphores can be waited on, which sounds nice in theory, but in my + // own testing waiting on semaphores can cause scheduling issues and + // cause wakeups as late as 1ms from when it was signalled, which is + // unbelievably bad if we're trying to do frame pacing. This means + // we are going to have to do a spinlock poll anyway. + // 4. Guess what info we need? 
Timestamp information. Guess what + // supports polling of an availability bit? Timestamp information. + // Why bother with semaphores at all then? Polling a semaphore might + // be faster, but the difference appears to be negligible. + using cbs_t = std::vector; auto next_submits = std::vector{}; @@ -462,13 +488,15 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count, // more explicit + insurance if that changes. auto handles = std::vector>{}; + const auto now = std::chrono::steady_clock::now(); + std::ranges::transform( std::span{submit_infos, submit_count}, std::back_inserter(next_submits), [&](const auto& submit) { const auto head_handle = queue_context->timestamp_pool->acquire(); const auto tail_handle = queue_context->timestamp_pool->acquire(); head_handle->setup_command_buffers(*tail_handle, *queue_context); - queue_context->notify_submit(submit, head_handle, tail_handle); + queue_context->notify_submit(submit, head_handle, tail_handle, now); handles.emplace_back(head_handle); handles.emplace_back(tail_handle); @@ -508,6 +536,8 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count, auto next_submits = std::vector{}; auto next_cbs = std::vector>{}; auto handles = std::vector>{}; + + const auto now = std::chrono::steady_clock::now(); std::ranges::transform( std::span{submit_infos, submit_count}, std::back_inserter(next_submits), @@ -515,7 +545,7 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count, const auto head_handle = queue_context->timestamp_pool->acquire(); const auto tail_handle = queue_context->timestamp_pool->acquire(); head_handle->setup_command_buffers(*tail_handle, *queue_context); - queue_context->notify_submit(submit, head_handle, tail_handle); + queue_context->notify_submit(submit, head_handle, tail_handle, now); next_cbs.emplace_back([&]() -> auto { auto cbs = std::make_unique(); diff --git a/src/queue_context.cc b/src/queue_context.cc index 2f0a89d..e210685 100644 --- a/src/queue_context.cc +++ b/src/queue_context.cc @@ 
-5,7 +5,6 @@ #include #include #include -#include #include #include @@ -51,7 +50,8 @@ QueueContext::~QueueContext() { void QueueContext::notify_submit( const VkSubmitInfo& info, const std::shared_ptr head_handle, - const std::shared_ptr tail_handle) { + const std::shared_ptr tail_handle, + const DeviceContext::Clock::time_point_t& now) { auto signals = std::unordered_set{}; auto waits = std::unordered_set{}; @@ -71,7 +71,7 @@ void QueueContext::notify_submit( } this->submissions.emplace_back(std::make_unique( - std::move(signals), std::move(waits), head_handle, tail_handle)); + std::move(signals), std::move(waits), head_handle, tail_handle, now)); // TODO HACK if (std::size(this->submissions) > 100) { @@ -82,7 +82,8 @@ void QueueContext::notify_submit( void QueueContext::notify_submit( const VkSubmitInfo2& info, const std::shared_ptr head_handle, - const std::shared_ptr tail_handle) { + const std::shared_ptr tail_handle, + const DeviceContext::Clock::time_point_t& now) { auto signals = std::unordered_set{}; auto waits = std::unordered_set{}; @@ -108,7 +109,7 @@ void QueueContext::notify_submit( } this->submissions.emplace_back(std::make_unique( - std::move(signals), std::move(waits), head_handle, tail_handle)); + std::move(signals), std::move(waits), head_handle, tail_handle, now)); // TODO HACK if (std::size(this->submissions) > 100) { @@ -162,10 +163,9 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) { return *start_iter; }(); - this->in_flight_frames.emplace_back(Frame{ - .prev_frame_last_submit = prev_frame_last_submit, - .submissions = std::move(this->submissions), - }); + this->in_flight_frames.emplace_back( + Frame{.submissions = std::move(this->submissions), + .cpu_post_present_time = std::chrono::steady_clock::now()}); assert(std::size(this->in_flight_frames.back().submissions)); // *valid but unspecified state after move, so clear!* this->submissions.clear(); @@ -270,14 +270,27 @@ void QueueContext::process_frames() { return gputime + 
(end - start); }); - const auto start = - frame.prev_frame_last_submit->end_handle->get_time_required(); - const auto end = merged.back().end; - const auto not_gputime = (end - start) - gputime; + // Our cpu_start value here refers to the time when the CPU was allowed + // to move past the present call and, in theory, begin cpu work on the + // next frame. + const auto cpu_start = [&]() -> auto { + if (const auto it = std::rbegin(this->timings); + it != std::rend(this->timings)) { + return (*it)->frame.cpu_post_present_time; + } + // This will happen *once*, and only for the first frame. We don't + // have a way of knowing when the CPU first started work obviously + // in this case because we're a vulkan layer and not omniscient. + // Just return our first submit's start for this edge case. + return frame.submissions.front()->start_handle->get_time_required(); + }(); + + const auto cputime = + frame.submissions.front()->enqueued_time - cpu_start; auto timing = Timing{ .gputime = gputime, - .not_gputime = not_gputime, + .cputime = cputime, .frame = frame, }; this->timings.emplace_back(std::make_unique(timing)); @@ -307,13 +320,14 @@ void QueueContext::sleep_in_present() { return; } - // This is doing more than it looks like one line can do (tbf it is a long - // line). It's getting the most recent frame and waiting until its start has + // This is getting the most recent frame and waiting until its start has // begun. This means that, in the case of >1 frame in flight, it's draining // all of them before we're allowed to move forward. 
- const auto a = this->in_flight_frames.back() - .submissions.front() - ->start_handle->get_time_spinlock(); + const auto first_gpu_work = [&]() -> auto { + const auto& most_recent_frame = this->in_flight_frames.back(); + const auto& first_submission = most_recent_frame.submissions.front(); + return first_submission->start_handle->get_time_spinlock(); + }(); // Process frames because as stated above, we might have multiple frames // now completed. @@ -343,20 +357,30 @@ void QueueContext::sleep_in_present() { const auto expected_gputime = calc_median([](const auto& timing) { return timing->gputime; }); - const auto expected_not_gputime = - calc_median([](const auto& timing) { return timing->not_gputime; }); + const auto expected_cputime = + calc_median([](const auto& timing) { return timing->cputime; }); std::cerr << " expected gputime: "; debug_log_time(expected_gputime); - std::cerr << " expected not_gputime: "; - debug_log_time(expected_not_gputime); + std::cerr << " expected cputime: "; + debug_log_time(expected_cputime); + + // Should look like this: + // total_length = expected_gputime + // |------------------------x------------------------------| + // ^ first_gpu_work now last_gpu_work ^ const auto now = std::chrono::steady_clock::now(); - const auto dist = now - a; - const auto expected = expected_gputime - dist; + const auto dist = now - first_gpu_work; + const auto expected_dist_to_last = expected_gputime - dist; + + const auto wait_time = expected_dist_to_last - expected_cputime; + + auto& frame = this->in_flight_frames.back(); + const auto& last_gpu_work = frame.submissions.back()->end_handle; + last_gpu_work->get_time_spinlock(now + wait_time); - const auto& frame = this->in_flight_frames.back(); - frame.submissions.back()->end_handle->get_time_spinlock(now + expected); + frame.cpu_post_present_time = std::chrono::steady_clock::now(); } } // namespace low_latency \ No newline at end of file diff --git a/src/queue_context.hh b/src/queue_context.hh index 
f8782de..0c5e51f 100644 --- a/src/queue_context.hh +++ b/src/queue_context.hh @@ -36,6 +36,8 @@ class QueueContext final : public Context { const std::shared_ptr start_handle; const std::shared_ptr end_handle; + + const DeviceContext::Clock::time_point_t enqueued_time; std::string debug; }; @@ -48,14 +50,17 @@ class QueueContext final : public Context { // present was called. // std::size(submissions) >= 1 btw struct Frame { - submission_ptr_t prev_frame_last_submit; std::deque submissions; + + // the point that control flow was returned from VkQueuePresentKHR back to the + // application. + DeviceContext::Clock::time_point_t cpu_post_present_time; }; std::deque in_flight_frames; // Completed frames. struct Timing { - DeviceContext::Clock::time_point_t::duration gputime, not_gputime; + DeviceContext::Clock::time_point_t::duration gputime, cputime; Frame frame; }; @@ -70,15 +75,15 @@ class QueueContext final : public Context { virtual ~QueueContext(); public: - void - notify_submit(const VkSubmitInfo& info, - const std::shared_ptr head_handle, - const std::shared_ptr tail_handle); - - void - notify_submit(const VkSubmitInfo2& info, - const std::shared_ptr head_handle, - const std::shared_ptr tail_handle); + void notify_submit(const VkSubmitInfo& info, + const std::shared_ptr head_handle, + const std::shared_ptr tail_handle, + const DeviceContext::Clock::time_point_t& now); + + void notify_submit(const VkSubmitInfo2& info, + const std::shared_ptr head_handle, + const std::shared_ptr tail_handle, + const DeviceContext::Clock::time_point_t& now); void notify_present(const VkPresentInfoKHR& info); -- cgit v1.2.3