diff options
| author | Nicolas James <nj3ahxac@gmail.com> | 2026-02-15 12:52:19 +1100 |
|---|---|---|
| committer | Nicolas James <nj3ahxac@gmail.com> | 2026-02-15 12:52:19 +1100 |
| commit | e0f7daf292db65d8aa492b6bc29ad245a9f83a2d (patch) | |
| tree | 5463fb6a6d2f22dfd6442252301672860823d0d1 | |
| parent | 0deb469d5a7c9a16179139dcff74a54aac1791a0 (diff) | |
Implement latency reduction ("Anti-Lag 1" style; the NVIDIA Reflex equivalent)
| -rw-r--r-- | src/device_context.cc | 5 | ||||
| -rw-r--r-- | src/device_context.hh | 3 | ||||
| -rw-r--r-- | src/layer.cc | 147 | ||||
| -rw-r--r-- | src/queue_context.cc | 429 | ||||
| -rw-r--r-- | src/queue_context.hh | 36 |
5 files changed, 258 insertions, 362 deletions
diff --git a/src/device_context.cc b/src/device_context.cc index f849df1..59d818e 100644 --- a/src/device_context.cc +++ b/src/device_context.cc @@ -2,6 +2,7 @@ #include "queue_context.hh" #include <utility> +#include <iostream> namespace low_latency { @@ -24,6 +25,9 @@ DeviceContext::~DeviceContext() { void DeviceContext::notify_acquire(const VkSwapchainKHR& swapchain, const std::uint32_t& image_index, const VkSemaphore& signal_semaphore) { + + std::cerr << "notify acquire for swapchain: " << swapchain << " : " << image_index << '\n'; + std::cerr << " signal semaphore: " << signal_semaphore << '\n'; const auto it = this->swapchain_signals.try_emplace(swapchain).first; @@ -57,6 +61,7 @@ void DeviceContext::Clock::calibrate() { clock_gettime(CLOCK_MONOTONIC, &tv); return tv.tv_nsec + tv.tv_sec*1000000000ull; */ + const auto steady_before = std::chrono::steady_clock::now(); device.vtable.GetCalibratedTimestampsKHR(device.device, 2, std::data(infos), &calibrated_result.device, diff --git a/src/device_context.hh b/src/device_context.hh index c08cec2..8a86cfb 100644 --- a/src/device_context.hh +++ b/src/device_context.hh @@ -36,7 +36,8 @@ struct DeviceContext final : public Context { struct Clock { public: - using time_point_t = std::chrono::steady_clock::time_point; + using time_point_t = std::chrono::time_point<std::chrono::steady_clock, + std::chrono::nanoseconds>; const DeviceContext& device; public: diff --git a/src/layer.cc b/src/layer.cc index 1b1d9e7..f9917f6 100644 --- a/src/layer.cc +++ b/src/layer.cc @@ -1,5 +1,6 @@ #include "layer.hh" +#include <iostream> #include <memory> #include <span> #include <string_view> @@ -278,7 +279,7 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice( return result; } - + #define DEVICE_VTABLE_LOAD(name) \ .name = reinterpret_cast<PFN_vk##name>(gdpa(*pDevice, "vk" #name)) auto vtable = VkuDeviceDispatchTable{ @@ -457,23 +458,19 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count, } // We have to avoid casting away the 
const* of the passed VkSubmitInfos. - // We wrap every single submission with *two* extra VkSubmitInfos to - // accomplish this. The first executes a command buffer that + // So we end up copying a lot of stuff and wrapping them in unique_ptrs + // so their position in memory is stable. using cb_vect = std::vector<VkCommandBuffer>; using tssi_ptr_t = std::unique_ptr<VkTimelineSemaphoreSubmitInfo>; auto next_submits = std::vector<VkSubmitInfo>{}; auto next_cbs = std::vector<std::unique_ptr<cb_vect>>{}; - auto next_signals = std::vector<std::unique_ptr<std::uint64_t>>{}; - auto next_tssis = std::vector<tssi_ptr_t>{}; auto handles = std::vector<std::shared_ptr<TimestampPool::Handle>>{}; for (const auto& submit_info : std::span{submit_infos, submit_count}) { const auto head_handle = queue_context->timestamp_pool->acquire(); const auto tail_handle = queue_context->timestamp_pool->acquire(); - // Head is special as we need to inject a CB into a copy of - // their command buffers that records the time the waits completed. 
next_cbs.emplace_back([&]() -> auto { auto cbs = std::make_unique<std::vector<VkCommandBuffer>>(); head_handle->setup_command_buffers(*tail_handle, *queue_context); @@ -488,42 +485,13 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count, next_submits.back().pCommandBuffers = std::data(*next_cbs.back()); next_submits.back().commandBufferCount = std::size(*next_cbs.back()); - const auto next_signal = 1 + queue_context->semaphore_sequence++; - - next_signals.push_back(std::make_unique<std::uint64_t>(next_signal)); - - next_tssis.push_back(std::make_unique<VkTimelineSemaphoreSubmitInfo>( - VkTimelineSemaphoreSubmitInfo{ - .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR, - .signalSemaphoreValueCount = 1, - .pSignalSemaphoreValues = next_signals.back().get(), - })); - next_submits.push_back(VkSubmitInfo{ - .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, - .pNext = next_tssis.back().get(), - .commandBufferCount = 1, - .pCommandBuffers = &tail_handle->command_buffer, - .signalSemaphoreCount = 1, - .pSignalSemaphores = &queue_context->semaphore, - }); - - queue_context->notify_submit(submit_info, next_signal, head_handle, - tail_handle); - + queue_context->notify_submit(submit_info, head_handle, tail_handle); handles.push_back(head_handle); handles.push_back(tail_handle); } - if (const auto res = vtable.QueueSubmit(queue, std::size(next_submits), - std::data(next_submits), fence); - res != VK_SUCCESS) { - - return res; - } - - // ?!? - - return VK_SUCCESS; + return vtable.QueueSubmit(queue, std::size(next_submits), + std::data(next_submits), fence); } // The logic for this function is identical to vkSubmitInfo. 
@@ -531,71 +499,52 @@ static VKAPI_ATTR VkResult VKAPI_CALL vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count, const VkSubmitInfo2* submit_infos, VkFence fence) { - const auto queue_context = layer_context.get_context(queue); + const auto& queue_context = layer_context.get_context(queue); const auto& vtable = queue_context->device_context.vtable; - // TODO - if (!submit_count || true) { + if (!submit_count) { return vtable.QueueSubmit2(queue, submit_count, submit_infos, fence); } - /* - auto timestamp_handle = queue_context->timestamp_pool->acquire(); - timestamp_handle->setup_command_buffers(vtable); - const auto& [head_cb, tail_cb] = timestamp_handle->command_buffers; - - const auto next_command_buffers = [&]() -> auto { - auto next_command_buffers = std::vector<VkCommandBufferSubmitInfo>{}; - next_command_buffers.push_back(VkCommandBufferSubmitInfo{ - .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, - .commandBuffer = head_cb, - }); - std::ranges::copy_n(submit_infos[0].pCommandBufferInfos, - submit_infos[0].commandBufferInfoCount, - std::back_inserter(next_command_buffers)); - return next_command_buffers; - }(); + using cb_vect_t = std::vector<VkCommandBufferSubmitInfo>; + auto next_submits = std::vector<VkSubmitInfo2>{}; + auto next_cbs = std::vector<std::unique_ptr<cb_vect_t>>{}; + auto handles = std::vector<std::shared_ptr<TimestampPool::Handle>>{}; - auto next_submit_infos = std::vector<VkSubmitInfo2>(); - std::ranges::copy_n(submit_infos, submit_count, - std::back_inserter(next_submit_infos)); - next_submit_infos[0].pCommandBufferInfos = std::data(next_command_buffers); - next_submit_infos[0].commandBufferInfoCount = - std::size(next_command_buffers); - - const auto target_semaphore_sequence = - 1 + queue_context->semaphore_sequence++; - const auto tail_ssi = VkSemaphoreSubmitInfo{ - .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO, - .semaphore = queue_context->semaphore, - .value = target_semaphore_sequence, - .stageMask = 
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, - }; - const auto tail_cbsi = VkCommandBufferSubmitInfo{ - .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, - .commandBuffer = tail_cb, - }; - next_submit_infos.push_back(VkSubmitInfo2{ - .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2, - .commandBufferInfoCount = 1, - .pCommandBufferInfos = &tail_cbsi, - .signalSemaphoreInfoCount = 1, - .pSignalSemaphoreInfos = &tail_ssi, - }); - - if (const auto res = - vtable.QueueSubmit2(queue, std::size(next_submit_infos), - std::data(next_submit_infos), fence); - res != VK_SUCCESS) { - return res; - } + for (const auto& submit_info : std::span{submit_infos, submit_count}) { + const auto head_handle = queue_context->timestamp_pool->acquire(); + const auto tail_handle = queue_context->timestamp_pool->acquire(); + + next_cbs.emplace_back([&]() -> auto { + auto cbs = std::make_unique<cb_vect_t>(); + head_handle->setup_command_buffers(*tail_handle, *queue_context); + cbs->push_back(VkCommandBufferSubmitInfo{ + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, + .commandBuffer = head_handle->command_buffer, + }); + std::ranges::copy_n(submit_info.pCommandBufferInfos, + submit_info.commandBufferInfoCount, + std::back_inserter(*cbs)); + cbs->push_back(VkCommandBufferSubmitInfo{ + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, + .commandBuffer = tail_handle->command_buffer, + }); + return cbs; + }()); - queue_context->notify_submit({submit_infos, submit_count}, - target_semaphore_sequence, - std::move(timestamp_handle)); + next_submits.push_back(submit_info); + next_submits.back().pCommandBufferInfos = std::data(*next_cbs.back()); + next_submits.back().commandBufferInfoCount = + std::size(*next_cbs.back()); - return VK_SUCCESS; - */ + queue_context->notify_submit(submit_info, head_handle, tail_handle); + + handles.push_back(head_handle); + handles.push_back(tail_handle); + } + + return vtable.QueueSubmit2(queue, std::size(next_submits), + std::data(next_submits), fence); } static 
VKAPI_ATTR VkResult VKAPI_CALL @@ -621,10 +570,10 @@ vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* present_info) { queue_context->notify_present(*present_info); } - if (const auto sleep_time = queue_context->get_delay_time(); - sleep_time.has_value()) { + if (const auto sleep_until = queue_context->get_sleep_until(); + sleep_until.has_value()) { - std::this_thread::sleep_for(*sleep_time); + std::this_thread::sleep_until(*sleep_until); } return VK_SUCCESS; diff --git a/src/queue_context.cc b/src/queue_context.cc index 99cf51e..2b79b53 100644 --- a/src/queue_context.cc +++ b/src/queue_context.cc @@ -2,8 +2,10 @@ #include "device_context.hh" #include "timestamp_pool.hh" +#include <algorithm> #include <chrono> #include <iostream> +#include <span> namespace low_latency { @@ -67,7 +69,7 @@ QueueContext::~QueueContext() { } void QueueContext::notify_submit( - const VkSubmitInfo& info, const std::uint64_t& target_semaphore_sequence, + const VkSubmitInfo& info, const std::shared_ptr<TimestampPool::Handle> head_handle, const std::shared_ptr<TimestampPool::Handle> tail_handle) { @@ -79,8 +81,7 @@ void QueueContext::notify_submit( std::inserter(signals, std::end(signals))); this->submissions.emplace_back(std::make_unique<Submission>( - std::move(signals), std::move(waits), target_semaphore_sequence, - head_handle, tail_handle)); + std::move(signals), std::move(waits), head_handle, tail_handle)); // TODO HACK if (std::size(this->submissions) > 100) { @@ -88,40 +89,42 @@ void QueueContext::notify_submit( } } -/* void QueueContext::notify_submit( - std::span<const VkSubmitInfo2> infos, - const std::uint64_t target_semaphore_sequence, - std::shared_ptr<TimestampPool::Handle>&& handle) { + const VkSubmitInfo2& info, + const std::shared_ptr<TimestampPool::Handle> head_handle, + const std::shared_ptr<TimestampPool::Handle> tail_handle) { auto signals = std::unordered_set<VkSemaphore>{}; auto waits = std::unordered_set<VkSemaphore>{}; - for (const auto& info : infos) { - 
constexpr auto get_semaphore = [](const auto& semaphore_info) { - return semaphore_info.semaphore; - }; - std::ranges::transform(info.pSignalSemaphoreInfos, - std::next(info.pSignalSemaphoreInfos, - info.signalSemaphoreInfoCount), - std::inserter(signals, std::end(signals)), - get_semaphore); - std::ranges::transform( - info.pWaitSemaphoreInfos, - std::next(info.pWaitSemaphoreInfos, info.waitSemaphoreInfoCount), - std::inserter(waits, std::end(waits)), get_semaphore); + std::ranges::transform( + std::span{info.pWaitSemaphoreInfos, info.waitSemaphoreInfoCount}, + std::inserter(waits, std::end(waits)), + [](const auto& info) -> auto { return info.semaphore; }); + + std::ranges::transform( + std::span{info.pSignalSemaphoreInfos, info.signalSemaphoreInfoCount}, + std::inserter(signals, std::end(signals)), + [](const auto& info) -> auto { return info.semaphore; }); + + std::cerr << "submit2 notif for queue " << this->queue << '\n'; + std::cerr << " signals: \n"; + for (const auto& signal : signals) { + std::cerr << " " << signal << '\n'; + } + std::cerr << " waits: \n"; + for (const auto& wait : waits) { + std::cerr << " " << wait << '\n'; } this->submissions.emplace_back(std::make_unique<Submission>( - std::move(signals), std::move(waits), target_semaphore_sequence, - std::move(handle))); + std::move(signals), std::move(waits), head_handle, tail_handle)); // TODO HACK if (std::size(this->submissions) > 100) { this->submissions.pop_front(); } } -*/ void QueueContext::notify_present(const VkPresentInfoKHR& info) { @@ -153,7 +156,7 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) { return collected_semaphores; }(); - const auto start_iter = std::ranges::find_if( + const auto acquire_iter = std::ranges::find_if( std::rbegin(this->submissions), std::rend(this->submissions), [&](const auto& submission) { return std::ranges::any_of( @@ -162,13 +165,13 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) { }); }); - if (start_iter == 
std::rend(this->submissions)) { - std::cout << "couldn't find starting submission!\n"; + if (acquire_iter == std::rend(this->submissions)) { + std::cerr << "couldn't find starting submission!\n"; return; } - const auto& start = *start_iter; + const auto& acquire = *acquire_iter; - const auto end_iter = std::ranges::find_if( + const auto present_iter = std::ranges::find_if( std::rbegin(this->submissions), std::rend(this->submissions), [&](const auto& submission) { return std::ranges::any_of( @@ -176,43 +179,61 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) { [&](const auto& signal) { return waits.contains(signal); }); }); - if (end_iter == std::rend(this->submissions)) { - std::cout << "couldn't find ending submission!\n"; + if (present_iter == std::rend(this->submissions)) { + std::cerr << "couldn't find ending submission!\n"; return; } - const auto& end = *end_iter; + const auto& end = *present_iter; + + std::cerr << "present for queue: " << queue << ", our waits:\n"; + for (const auto& wait : waits) { + std::cerr << " " << wait << '\n'; + } + + // The work including and between acquire -> present is effectively + // guaranteed to contribute to our frame. We are going to mark this point + // for future queues to read the 'start of frame' from. + (*present_iter)->end_of_frame_marker = true; + + // Now we read backwards to try to find our true start, starting at our + // acquire. + const auto start_iter = std::prev(std::ranges::find_if( + std::next(acquire_iter), std::rend(this->submissions), + [](const auto& submission) { + return submission->end_of_frame_marker; + })); + const auto& start = *start_iter; + + // start iter can't be end cause it's prev'd. 
auto frame = Frame{.start = Frame::Timepoint{ .context = *this, .handle = start->start_handle, - .sequence = start->sequence, }, .end = Frame::Timepoint{ .context = *this, .handle = end->end_handle, - .sequence = end->sequence, }}; this->in_flight_frames.emplace_back( std::make_unique<Frame>(std::move(frame))); - - // hack - if (this->in_flight_frames.size() > 5) { - this->in_flight_frames.pop_front(); - } } -std::optional<QueueContext::duration_t> QueueContext::get_delay_time() { +const auto debug_log_time = [](const auto& diff) { + using namespace std::chrono; + const auto ms = duration_cast<milliseconds>(diff); + const auto us = duration_cast<microseconds>(diff - ms); + const auto ns = duration_cast<nanoseconds>(diff - ms - us); + std::cerr << ms << " " << us << " " << ns << "\n"; +}; + +void QueueContext::process_frames() { if (!std::size(this->in_flight_frames)) { - return std::nullopt; + return; } - // We are about to query the wait semaphores of all of our current - // frames in flight. They may come from the same device, so we're going - // to build a mapping here to reduce vulkan calls. Not only that, - // we have to do this or else our timing information becomes broken - // as this loop iterates. - const auto target_devices = [this]() -> auto { + // Collect all devices and call calibrate. + [this]() -> auto { using context_ref_t = std::reference_wrapper<DeviceContext>; auto target_devices = std::unordered_map<VkDevice, context_ref_t>{}; for (const auto& frame : this->in_flight_frames) { @@ -222,237 +243,139 @@ std::optional<QueueContext::duration_t> QueueContext::get_delay_time() { target_devices.try_emplace(start.device, std::ref(start)); target_devices.try_emplace(end.device, std::ref(end)); } - return target_devices; - }(); - - // Calibrate timestamps before we acquire semaphores. 
- for (const auto& pair : target_devices) { - auto& device = pair.second; - device_context.clock.calibrate(); - } - - // Now we have all owned devices and their clocks are in a good state. - // We need to build another mapping of semaphores to their queries now. - const auto queue_sequences = [this]() -> auto { - auto queue_sequences = std::unordered_map<VkQueue, std::uint64_t>{}; - for (const auto& frame : this->in_flight_frames) { - auto& start = frame->start.context; - auto& end = frame->end.context; - - for (const auto& queue_ptr : {&start, &end}) { - if (queue_sequences.contains(queue_ptr->queue)) { - continue; - } - - const auto& vtable = queue_ptr->device_context.vtable; - auto seq = std::uint64_t{}; - vtable.GetSemaphoreCounterValueKHR(this->device_context.device, - this->semaphore, &seq); - queue_sequences.emplace(queue_ptr->queue, seq); - } + for (const auto& pair : target_devices) { + auto& device = pair.second.get(); + device.clock.calibrate(); } - return queue_sequences; }(); - // Now all devices we are about to query are primed to query. - // We have all sequence numbers from all queus we could possibly query. 
- const auto S = std::size(this->in_flight_frames); - for (auto i = std::size_t{0}; i < S; ++i) { - assert(this->in_flight_frames[i]); - const auto& frame = *this->in_flight_frames[i]; - const auto& start = frame.start; - const auto& end = frame.end; - - std::cout << " Evaluating the frame that's " << S - i - 1 - << " behind\n"; - - std::cout << " target start seq: " << start.sequence << '\n'; - std::cout << " target end seq: " << end.sequence << '\n'; - - const auto start_seq_it = queue_sequences.find(start.context.queue); - assert(start_seq_it != std::end(queue_sequences)); - const auto& start_seq = start_seq_it->second; - if (start_seq < start.sequence) { - std::cout << " frame hasn't started yet !\n "; - continue; - } + const auto get_tick_time = [](const auto& timepoint) + -> std::optional<DeviceContext::Clock::time_point_t> { + const auto& handle = timepoint.handle; + const auto& context = timepoint.context; - /* - const auto start_ticks_opt = - start.handle->get_ticks(*start.context.timestamp_pool); - if (!start_ticks_opt.has_value()) { - std::cout << " frame hasn't started yet !\n "; + const auto ticks = handle->get_ticks(*context.timestamp_pool); + if (!ticks.has_value()) { + return std::nullopt; } + const auto& clock = context.device_context.clock; + return clock.ticks_to_time(*ticks); + }; - std::cout << " START TICKS: " << start_ticks << '\n'; - const auto start_time = - start.context.device_context.clock.ticks_to_time(start_ticks); - - { - using namespace std::chrono; - const auto diff = now - a; - const auto ms = duration_cast<milliseconds>(diff); - const auto us = duration_cast<microseconds>(diff - ms); - const auto ns = duration_cast<nanoseconds>(diff - ms - us); - std::cout << " frame started: " << ms << " ms " << us - << " us " << ns << " ns ago\n"; - } + std::cerr << "starting frame readout\n"; + while (std::size(this->in_flight_frames)) { + const auto& frame = this->in_flight_frames.front(); + assert(frame); - const auto end_seq_it = 
queue_sequences.find(end.context.queue); - assert(end_seq_it != std::end(queue_sequences)); - const auto& end_seq = end_seq_it->second; - if (start_seq < end.sequence) { - std::cout << " frame hasn't started yet !\n "; - continue; + const auto a = get_tick_time(frame->start); + if (!a.has_value()) { + break; } - */ - } - - return std::nullopt; - // -} -// now it's all coming together -// std::optional<QueueContext::duration_t> QueueContext::get_delay_time() { -/* -if (!std::size(this->in_flight_frames)) { - return std::nullopt; -} - -auto seq = std::uint64_t{}; -this->device_context.vtable.GetSemaphoreCounterValueKHR( - this->device_context.device, this->semaphore, &seq); - -// Get semaphore first, then poll! -this->timestamp_pool->poll(); - -// idk how frequently we should call this. -this->device_context.calibrate_timestamps(); + const auto b = get_tick_time(frame->end); + if (!b.has_value()) { + break; + } -static auto gpu_frametimes = std::deque<uint64_t>{}; -static auto cpu_frametimes = std::deque<uint64_t>{}; + // assert(a <= b); -const auto S = std::size(this->in_flight_frames); + // + const auto last_b = + this->timings.empty() ? 
*a : this->timings.back()->gpu_end; -std::cout << "\nSTART FRAME READOUT\n"; -std::cout << "error bound: " << this->device_context.clock.error_bound - << '\n'; -std::cout << "num frames in flight: " << S << '\n'; -std::cout << "from oldest -> newest\n"; + // assert(last_b <= a); -// const auto b_seq = semaphore_from_context(*this); -const auto now = std::chrono::steady_clock::now(); + const auto frametime = *b - last_b; -auto i = std::size_t{0}; -for (; i < std::size(this->in_flight_frames); ++i) { - const auto& frame = this->in_flight_frames[i]; - std::cout << " Evaluating the frame that's " << S - i - 1 - << " behind\n"; - if (!frame) { - std::cout << " nullptr!\n"; - continue; - } + std::cerr + << " calculated total time from last frame (frametime): "; + debug_log_time(*b - last_b); - std::cout << " target start: " << frame->target_start_sequence << -'\n'; std::cout << " target end: " << frame->target_end_sequence << '\n'; if -(seq < frame->target_start_sequence) { std::cout << " frame hasn't -started yet!\n"; continue; - } + this->timings.emplace_back(std::make_unique<Timing>(Timing{ + .gpu_start = *a, + .gpu_end = *b, + .frametime = frametime, + })); - const auto start_ticks = - frame->start_context.timestamp_pool->get_polled(*frame->start); - std::cout << " START TICKS: " << start_ticks << '\n'; - const auto& a_clock = frame->start_context.device_context.clock; - const auto a = a_clock.ticks_to_time(start_ticks); - - { - using namespace std::chrono; - const auto diff = now - a; - const auto ms = duration_cast<milliseconds>(diff); - const auto us = duration_cast<microseconds>(diff - ms); - const auto ns = duration_cast<nanoseconds>(diff - ms - us); - std::cout << " frame started: " << ms << " ms " << us - << " us " << ns << " ns ago\n"; - } - - if (seq < frame->target_end_sequence) { - std::cout << " frame hasn't ended yet!\n"; - continue; - } - - - const auto end_ticks = - frame->end_context.timestamp_pool->get_polled(*frame->end, true); - const auto& b_clock 
= frame->end_context.device_context.clock; - std::cout << " END_TICKS: " << end_ticks << '\n'; - const auto b = b_clock.ticks_to_time(end_ticks); - { - using namespace std::chrono; - if (now <= b) { - std::cout << "b happened before now?\n"; - } - const auto diff = now - b; - const auto ms = duration_cast<milliseconds>(diff); - const auto us = duration_cast<microseconds>(diff - ms); - const auto ns = duration_cast<nanoseconds>(diff - ms - us); - std::cout << " frame ended: " << ms << " ms " << us - << " us " << ns << " ns ago\n"; + this->in_flight_frames.pop_front(); } - const auto gpu_time = b - a; - { - using namespace std::chrono; - const auto diff = gpu_time; - const auto ms = duration_cast<milliseconds>(diff); - const auto us = duration_cast<microseconds>(diff - ms); - const auto ns = duration_cast<nanoseconds>(diff - ms - us); - std::cout << " gpu_time: " << ms << " ms " << us - << " us " << ns << " ns ago\n"; + const auto MAX_TRACKED = 50; + if (std::size(this->timings) < MAX_TRACKED) { + return; } - - /* - cpu_frametimes.emplace_back(cpu_time); - gpu_frametimes.emplace_back(gpu_time); -} - -/* -if (remove_index.has_value()) { - this->in_flight_frames.erase(std::begin(this->in_flight_frames), - std::begin(this->in_flight_frames) + - *remove_index); + this->timings.erase(std::begin(this->timings), + std::next(std::begin(this->timings), + std::size(this->timings) - MAX_TRACKED)); } -*/ - -/* -auto g_copy = gpu_frametimes; -auto c_copy = cpu_frametimes; -std::ranges::sort(g_copy); -std::ranges::sort(c_copy); - -constexpr auto N = 49; -if (std::size(cpu_frametimes) < N) { - return std::nullopt; -} - -const auto F = std::size(g_copy); -// close enough to median lol -const auto g = g_copy[F / 2]; -const auto c = c_copy[F / 2]; -std::cout << g << '\n'; +using opt_time_point_t = std::optional<DeviceContext::Clock::time_point_t>; +opt_time_point_t QueueContext::get_sleep_until() { -std::cout << " median gpu: " << (g / 1'000'000) << " ms " << g / 1'000 - << " us " << 
g << " ns\n"; -std::cout << " median cpu: " << c / 1'000'000 << " ms " << c / 1'000 - << " us " << c << " ns\n"; + // Call this to push all in flight frames into our timings structure, + // but only if they're completed. So now they are truly *in flight frames*. + this->process_frames(); + + // We have completed all frames. DO NOT WAIT! + if (!std::size(this->in_flight_frames)) { + return std::nullopt; + } -if (F > N) { - gpu_frametimes.pop_front(); - cpu_frametimes.pop_front(); -} + const auto median_frametime = [&, this]() { + auto vect = std::vector<Timing*>{}; + std::ranges::transform(this->timings, std::back_inserter(vect), + [](const auto& timing) { return timing.get(); }); + std::ranges::sort(vect, [](const auto& a, const auto& b) { + return a->frametime < b->frametime; + }); + return vect[std::size(vect) / 2]->frametime; + }(); -return std::nullopt; + // PRESENT CALL + // | -------x----- | -------x--------------| + // ^ last_b ^ a ^ b + // + // Us, the CPU on the host, is approximately at 'b'. + // We have a good guess for the distance between + // last_b and b (median_frametime). + // The GPU is at any point on this line (marked as x). + // Don't use A. It's less robust than just using last_b. + // It *might* be more accurate because it's closer, + // but there's an issue where there can sometimes be a very + // small distance between a and b because it is just the + // point in time when the vkAcquireSwapchainKHR signals + // the wait on the gpu queue, which can sometimes be tiny. + + std::cerr << " median 100 frametimes: "; + debug_log_time(median_frametime); + + // 2% of average gpu time for dealing with variance. + // This could be calculated more precisely with the + // numbers we have (like we could construct a high% confidence + // interval? not big on maths). + const auto slack = median_frametime / 50; + + // If we're more than 1 frame queued, then we should wait for + // that to complete before returning. 
It's likely way better to + // to sleep twice here and recompute between sleeps because we're + // extrapolating really far into the future here! TODO + const auto extra_delay = + median_frametime * (std::size(this->in_flight_frames) - 1); + + const auto& last_b = this->timings.back()->gpu_end; + + // All educated guesses: + // dist_to_b = frametime - dist_to_last_b; + // dist_to_last_b = now - last_b + // sleep_until = now + extra_delay + slack + dist_to_b + // = now + extra_delay + slack + (frametime - dist_to_last_b) + // = now + extra_delay + slack + frametime - (now - last_b) + + const auto now = std::chrono::steady_clock::now(); + assert(last_b <= now); + const auto dist = now - last_b; + // Even if this is negative, it's a no-op to sleep backwards. + return now + extra_delay + slack + median_frametime - dist; } -*/ } // namespace low_latency
\ No newline at end of file diff --git a/src/queue_context.hh b/src/queue_context.hh index 3df6af4..6a71754 100644 --- a/src/queue_context.hh +++ b/src/queue_context.hh @@ -2,6 +2,7 @@ #define QUEUE_STATE_HH_ #include "context.hh" +#include "device_context.hh" #include "timestamp_pool.hh" #include <vulkan/utility/vk_dispatch_table.h> @@ -14,8 +15,6 @@ namespace low_latency { -class DeviceContext; - class QueueContext final : public Context { public: DeviceContext& device_context; @@ -23,6 +22,8 @@ class QueueContext final : public Context { const VkQueue queue; const std::uint32_t queue_family_index; + // I used to use these to signal when we could read timestamps until + // I realised you could use hostQueryReset. std::uint64_t semaphore_sequence = 0; VkSemaphore semaphore; @@ -30,14 +31,17 @@ class QueueContext final : public Context { std::unique_ptr<TimestampPool> timestamp_pool; + private: + static constexpr auto MAX_TRACKED_TIMINGS = 50; // Potentially in flight queue submissions struct Submission { const std::unordered_set<VkSemaphore> signals; const std::unordered_set<VkSemaphore> waits; - const std::uint64_t sequence; const std::shared_ptr<TimestampPool::Handle> start_handle; const std::shared_ptr<TimestampPool::Handle> end_handle; + + bool end_of_frame_marker = false; }; std::deque<std::shared_ptr<Submission>> submissions; @@ -48,7 +52,6 @@ class QueueContext final : public Context { struct Timepoint { const QueueContext& context; const std::shared_ptr<TimestampPool::Handle> handle; - const std::uint64_t sequence; }; const Timepoint start; @@ -56,6 +59,20 @@ class QueueContext final : public Context { }; std::deque<std::unique_ptr<Frame>> in_flight_frames; + struct Timing { + + DeviceContext::Clock::time_point_t gpu_start; + DeviceContext::Clock::time_point_t gpu_end; + + // Distance between the last gpu_end and this one. + // So one entire go around, including all cpu and gpu. 
+ DeviceContext::Clock::time_point_t::duration frametime; + }; + std::deque<std::unique_ptr<Timing>> timings; + + private: + void process_frames(); + public: QueueContext(DeviceContext& device_context, const VkQueue& queue, const std::uint32_t& queue_family_index); @@ -63,18 +80,19 @@ class QueueContext final : public Context { public: void - notify_submit(const VkSubmitInfo& info, const std::uint64_t& sequence, + notify_submit(const VkSubmitInfo& info, const std::shared_ptr<TimestampPool::Handle> head_handle, const std::shared_ptr<TimestampPool::Handle> tail_handle); - // TODO submit2 + void + notify_submit(const VkSubmitInfo2& info, + const std::shared_ptr<TimestampPool::Handle> head_handle, + const std::shared_ptr<TimestampPool::Handle> tail_handle); void notify_present(const VkPresentInfoKHR& info); public: - // Computes the amount we should delay... - using duration_t = std::chrono::steady_clock::duration; - std::optional<duration_t> get_delay_time(); + std::optional<DeviceContext::Clock::time_point_t> get_sleep_until(); }; }; // namespace low_latency |
