From 50f009b81218c5367031ce9c51089ecddc2e853a Mon Sep 17 00:00:00 2001 From: Nicolas James Date: Tue, 24 Feb 2026 00:11:22 +1100 Subject: Cleanup, better document some areas --- src/device_context.cc | 67 ++++++++++++++++--------------------- src/device_context.hh | 8 ++--- src/layer.cc | 38 +++++++-------------- src/queue_context.cc | 93 ++++++++++++++++----------------------------------- src/queue_context.hh | 29 +++++++++++----- src/timestamp_pool.cc | 8 +++-- 6 files changed, 98 insertions(+), 145 deletions(-) (limited to 'src') diff --git a/src/device_context.cc b/src/device_context.cc index 2214b71..b149311 100644 --- a/src/device_context.cc +++ b/src/device_context.cc @@ -1,7 +1,6 @@ #include "device_context.hh" #include "queue_context.hh" -#include #include #include @@ -23,22 +22,6 @@ DeviceContext::~DeviceContext() { } } -void DeviceContext::notify_acquire(const VkSwapchainKHR& swapchain, - const std::uint32_t& image_index, - const VkSemaphore& signal_semaphore) { - - /* - std::cerr << "notify acquire for swapchain: " << swapchain << " : " - << image_index << '\n'; - std::cerr << " signal semaphore: " << signal_semaphore << '\n'; - */ - - const auto it = this->swapchain_signals.try_emplace(swapchain).first; - - // Doesn't matter if it was already there, overwrite it. - it->second.insert_or_assign(image_index, signal_semaphore); -} - DeviceContext::Clock::Clock(const DeviceContext& context) : device(context) { this->calibrate(); } @@ -92,17 +75,14 @@ DeviceContext::Clock::ticks_to_time(const std::uint64_t& ticks) const { return time_point_t{delta}; } -const auto debug_log_time2 = [](auto& stream, const auto& diff) { - using namespace std::chrono; - const auto ms = duration_cast(diff); - const auto us = duration_cast(diff - ms); - const auto ns = duration_cast(diff - ms - us); - stream << ms << " " << us << " " << ns << '\n'; -}; +void DeviceContext::notify_acquire(const VkSwapchainKHR& swapchain, + const std::uint32_t& image_index, + const VkSemaphore& signal_semaphore) { + const auto it = this->swapchain_signals.try_emplace(swapchain).first; -const auto debug_log_time = [](const auto& diff) { - debug_log_time2(std::cerr, diff); -}; + // Doesn't matter if it was already there, overwrite it. + it->second.insert_or_assign(image_index, signal_semaphore); +} void DeviceContext::sleep_in_input() { // Present hasn't happened yet, we don't know what queue to attack. @@ -110,26 +90,35 @@ void DeviceContext::sleep_in_input() { return; } - const auto before = std::chrono::steady_clock::now(); + const auto& frames = this->present_queue->in_flight_frames; + // No frame here means we're behind the GPU and do not need to delay. + // If anything we should speed up... + if (!std::size(frames)) { + return; + } + // If we're here, that means that there might be an outstanding frame that's // sitting on our present_queue which hasn't yet completed, so we need to // stall until it's finished. - const auto& frames = this->present_queue->in_flight_frames; - if (std::size(frames)) { - frames.back().submissions.back()->end_handle->get_time_spinlock(); - } - const auto after = std::chrono::steady_clock::now(); - //debug_log_time(after - before); - - // FIXME this should take into account 'cpu_time', which we currently do not... - // idk if it matters. + const auto& last_frame = frames.back(); + assert(std::size(last_frame.submissions)); + const auto& last_frame_submission = frames.back().submissions.back(); + last_frame_submission->end_handle->get_time_spinlock(); + + // From our sleep in present implementation, just spinning until + // the previous frame has completed did not work well. This was because + // there was a delay between presentation and when new work was given + // to the GPU. If we stalled the CPU without trying to account for this, we + // would get huge frame drops, loss of throughput, and the GPU would even + // clock down. So naturally I am concerned about this approach, but it seems + // to perform well so far in my own testing and is just beautifully elegant. } void DeviceContext::notify_antilag_update(const VkAntiLagDataAMD& data) { this->antilag_mode = data.mode; - this->antilag_fps = data.maxFPS; + this->antilag_fps = data.maxFPS; // TODO - // This might not be provided (probably just to set some settings). + // This might not be provided (probably just to set some settings?). if (!data.pPresentationInfo) { return; } diff --git a/src/device_context.hh b/src/device_context.hh index c73f97f..37817d5 100644 --- a/src/device_context.hh +++ b/src/device_context.hh @@ -2,7 +2,6 @@ #define DEVICE_CONTEXT_HH_ #include -#include #include #include @@ -30,6 +29,8 @@ struct DeviceContext final : public Context { std::unordered_map> queues; // We map swapchains to image indexes and their last signalled semaphore. + // FIXME: This isn't used right now, it was formerly used to map queue + // submissions but it ended up being unnecessary complexity. using index_semaphores_t = std::unordered_map; std::unordered_map swapchain_signals; @@ -54,7 +55,6 @@ struct DeviceContext final : public Context { }; Clock clock; - std::uint32_t antilag_fps = 0; VkAntiLagModeAMD antilag_mode = VK_ANTI_LAG_MODE_DRIVER_CONTROL_AMD; @@ -75,9 +75,9 @@ struct DeviceContext final : public Context { const std::uint32_t& image_index, const VkSemaphore& signal_semaphore); - // + // void notify_antilag_update(const VkAntiLagDataAMD& data); - + void notify_queue_present(const QueueContext& queue); }; diff --git a/src/layer.cc b/src/layer.cc index 12067a0..aea2154 100644 --- a/src/layer.cc +++ b/src/layer.cc @@ -1,6 +1,5 @@ #include "layer.hh" -#include #include #include #include @@ -302,8 +301,6 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice( DEVICE_VTABLE_LOAD(BeginCommandBuffer); DEVICE_VTABLE_LOAD(EndCommandBuffer); DEVICE_VTABLE_LOAD(ResetCommandBuffer); - DEVICE_VTABLE_LOAD(CmdDraw); - DEVICE_VTABLE_LOAD(CmdDrawIndexed); DEVICE_VTABLE_LOAD(CmdResetQueryPool); DEVICE_VTABLE_LOAD(GetDeviceQueue2); DEVICE_VTABLE_LOAD(QueueSubmit2); @@ -387,8 +384,7 @@ GetDeviceQueue(VkDevice device, std::uint32_t queue_family_index, device_context->queues.emplace(*queue, ptr); } -// Identical logic to gdq so some amount of duplication, we can't assume gdq1 is -// available apparently, what do I know? +// Identical logic to gdq1. static VKAPI_ATTR void VKAPI_CALL GetDeviceQueue2( VkDevice device, const VkDeviceQueueInfo2* info, VkQueue* queue) { @@ -454,11 +450,7 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count, const auto& queue_context = layer_context.get_context(queue); const auto& vtable = queue_context->device_context.vtable; - if (!submit_count) { // no-op submit we shouldn't worry about - return vtable.QueueSubmit(queue, submit_count, submit_infos, fence); - } - - if (!queue_context->should_inject_timestamps()) { + if (!submit_count || !queue_context->should_inject_timestamps()) { return vtable.QueueSubmit(queue, submit_count, submit_infos, fence); } @@ -476,17 +468,15 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count, // alone. // 2. Semaphores only signal at the end of their work, so we cannot use // them as a mechanism to know if work has started without doing - // another dummy submission. This adds complexity and also skews our - // timestamps slightly. - // 3. Semaphores can be waited which sounds nice in theory, but in my - // own testing waiting on semaphores can cause scheduling issues and - // cause wakeups as late as 1ms from when it was signalled, which is - // unbelievably bad if we're trying to do frame pacing. This means - // we are going to have to do a spinlock poll anyway. - // 4. Guess what info we need? Timestamp information. Guess what - // supports polling of an availability bit? Timestamp information. - // Why bother with semaphores at all then? Polling a semaphore might - // be faster, but the difference appears to be negligible. + // another dummy submission. This adds complexity and also might + // skew our timestamps slightly as they wouldn't be a part of the + // submission which contained those command buffers. + // 3. Timestamps support querying if their work has started/ended + // as long as we use the vkHostQueryReset extension to reset them + // before we consider them queryable. This means we don't need a + // 'is it valid to query' timeline semaphore. + // 4. The performance impact of using semaphores vs timestamps is + // negligable. using cbs_t = std::vector; auto next_submits = std::vector{}; @@ -541,11 +531,7 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count, const auto& queue_context = layer_context.get_context(queue); const auto& vtable = queue_context->device_context.vtable; - if (!submit_count) { - return vtable.QueueSubmit2(queue, submit_count, submit_infos, fence); - } - - if (!queue_context->should_inject_timestamps()) { + if (!submit_count || !queue_context->should_inject_timestamps()) { return vtable.QueueSubmit2(queue, submit_count, submit_infos, fence); } diff --git a/src/queue_context.cc b/src/queue_context.cc index 388019c..1f798de 100644 --- a/src/queue_context.cc +++ b/src/queue_context.cc @@ -12,31 +12,29 @@ namespace low_latency { -static VkCommandPool -make_command_pool(const DeviceContext& device_context, - const std::uint32_t& queue_family_index) { - - const auto cpci = VkCommandPoolCreateInfo{ - .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, - .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | - VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, - .queueFamilyIndex = queue_family_index, - }; - - auto command_pool = VkCommandPool{}; - device_context.vtable.CreateCommandPool(device_context.device, &cpci, - nullptr, &command_pool); - return command_pool; -} - QueueContext::QueueContext(DeviceContext& device_context, const VkQueue& queue, const std::uint32_t& queue_family_index) : device_context(device_context), queue(queue), - queue_family_index(queue_family_index), - // Important we make the command pool before the timestamp pool, because - // it's a dependency. - command_pool(make_command_pool(device_context, queue_family_index)), - timestamp_pool(std::make_unique(*this)) {} + queue_family_index(queue_family_index) { + + // Important we make the command pool before the timestamp pool, because + // it's a dependency. + this->command_pool = [&]() { + const auto cpci = VkCommandPoolCreateInfo{ + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | + VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, + .queueFamilyIndex = queue_family_index, + }; + + auto command_pool = VkCommandPool{}; + device_context.vtable.CreateCommandPool(device_context.device, &cpci, + nullptr, &command_pool); + return command_pool; + }(); + + this->timestamp_pool = std::make_unique(*this); +} QueueContext::~QueueContext() { @@ -63,27 +61,15 @@ void QueueContext::notify_submit( std::span{info.pSignalSemaphores, info.signalSemaphoreCount}, std::inserter(signals, std::end(signals))); - /* - std::cerr << "submit1 notif for queue " << this->queue << '\n'; - std::cerr << " signals: \n"; - for (const auto& signal : signals) { - std::cerr << " " << signal << '\n'; - } - std::cerr << " waits: \n"; - for (const auto& wait : waits) { - std::cerr << " " << wait << '\n'; - } - */ - this->submissions.emplace_back(std::make_unique( std::move(signals), std::move(waits), head_handle, tail_handle, now)); - // TODO HACK - if (std::size(this->submissions) > 100) { + if (std::size(this->submissions) > this->MAX_TRACKED_SUBMISSIONS) { this->submissions.pop_front(); } } +// Identical to notify_submit, but we use VkSubmitInfo2. void QueueContext::notify_submit( const VkSubmitInfo2& info, const std::shared_ptr head_handle, @@ -103,23 +89,10 @@ void QueueContext::notify_submit( std::inserter(signals, std::end(signals)), [](const auto& info) -> auto { return info.semaphore; }); - /* - std::cerr << "submit2 notif for queue " << this->queue << '\n'; - std::cerr << " signals: \n"; - for (const auto& signal : signals) { - std::cerr << " " << signal << '\n'; - } - std::cerr << " waits: \n"; - for (const auto& wait : waits) { - std::cerr << " " << wait << '\n'; - } - */ - this->submissions.emplace_back(std::make_unique( std::move(signals), std::move(waits), head_handle, tail_handle, now)); - // TODO HACK - if (std::size(this->submissions) > 100) { + if (std::size(this->submissions) > this->MAX_TRACKED_SUBMISSIONS) { this->submissions.pop_front(); } } @@ -139,7 +112,6 @@ void QueueContext::drain_submissions_to_frame() { const auto start_iter = std::begin(this->submissions); // no op submit? if (start_iter == std::end(this->submissions)) { - std::cerr << "ignored no op submit\n"; return; } const auto last_iter = std::prev(std::end(this->submissions)); @@ -208,20 +180,14 @@ void QueueContext::drain_frames_to_timings() { return; } - // We used to collect all devices that were pointed to by all potential - // submissions, put them in a set and then call.calibrate() on each once. - // This is unnecessary now - we assume all submissions come from the same - // queue. FIXME: don't assume this. - auto& device_context = this->device_context; - auto& clock = device_context.clock; - clock.calibrate(); + // Only need to calibrate this device, we don't support multi device anti + // lag. + this->device_context.clock.calibrate(); while (std::size(this->in_flight_frames)) { const auto& frame = this->in_flight_frames.front(); - if (!std::size(frame.submissions)) { - break; - } + assert(std::size(frame.submissions)); const auto& last_submission = frame.submissions.back(); @@ -335,11 +301,8 @@ void QueueContext::sleep_in_present() { const auto& device = this->device_context; const auto& vtable = device.vtable; - // Call this to push all in flight frames into our timings structure, - // but only if they're completed. So now they are truly *in flight - // frames*. + // After calling this, any remaining frames are truly *in fligh*. this->drain_frames_to_timings(); - if (!std::size(this->in_flight_frames)) { return; } diff --git a/src/queue_context.hh b/src/queue_context.hh index 2a3ea39..fbb04e8 100644 --- a/src/queue_context.hh +++ b/src/queue_context.hh @@ -16,6 +16,21 @@ namespace low_latency { class QueueContext final : public Context { + private: + // The amount of finished frame timing data we keep before eviction. + // For now, this value is also the number of data points used in the + // calculation of gpu timing information. + static constexpr auto MAX_TRACKED_TIMINGS = 50; + // The amount of queue submissions we allow tracked per queue before + // we give up tracking them. For a queue that is presented to, + // these submissions will be constantly moved to Frame structs so + // it's not an issue that we only track so many - unless it just + // happens that an application makes an unexpectedly large + // amount of vkQueueSubmit's per frame. For queues which don't + // present, this limit stops them from growing limitlessly in memory + // as we may not necessarily manually evict them yet. + static constexpr auto MAX_TRACKED_SUBMISSIONS = 50; + public: DeviceContext& device_context; @@ -27,8 +42,6 @@ class QueueContext final : public Context { std::unique_ptr timestamp_pool; public: - static constexpr auto MAX_TRACKED_TIMINGS = 50; - // Potentially in flight queue submissions that come from this queue. struct Submission { const std::unordered_set signals; @@ -36,7 +49,7 @@ class QueueContext final : public Context { const std::shared_ptr start_handle; const std::shared_ptr end_handle; - + const DeviceContext::Clock::time_point_t enqueued_time; }; using submission_ptr_t = std::shared_ptr; @@ -50,8 +63,8 @@ class QueueContext final : public Context { struct Frame { std::deque submissions; - // the point that control flow was returned from VkQueuePresentKHR back to the - // application. + // the point that control flow was returned from VkQueuePresentKHR back + // to the application. DeviceContext::Clock::time_point_t cpu_post_present_time; }; std::deque in_flight_frames; @@ -67,11 +80,11 @@ class QueueContext final : public Context { private: // Drains submissions and promotes them into a single frame object. void drain_submissions_to_frame(); - + // Drains in flight frames and promotes them into a Timing object if they // have completed. void drain_frames_to_timings(); - + // Antilag 1 equivalent where we sleep after present to reduce queueing. void sleep_in_present(); @@ -92,7 +105,7 @@ class QueueContext final : public Context { const DeviceContext::Clock::time_point_t& now); void notify_present(const VkPresentInfoKHR& info); - + public: bool should_inject_timestamps() const; }; diff --git a/src/timestamp_pool.cc b/src/timestamp_pool.cc index 5149747..e8ef9f5 100644 --- a/src/timestamp_pool.cc +++ b/src/timestamp_pool.cc @@ -3,8 +3,8 @@ #include "queue_context.hh" #include -#include #include +#include #include #include @@ -25,8 +25,10 @@ TimestampPool::QueryChunk::QueryChunk(const QueueContext& queue_context) { return qp; }(); - constexpr auto KEY_RANGE = std::views::iota(0u, QueryChunk::CHUNK_SIZE); - this->free_indices = std::make_unique(std::from_range, KEY_RANGE); + this->free_indices = []() { + constexpr auto KEYS = std::views::iota(0u, QueryChunk::CHUNK_SIZE); + return std::make_unique(std::from_range, KEYS); + }(); this->command_buffers = [&, this]() -> auto { auto cbs = std::make_unique>(CHUNK_SIZE); -- cgit v1.2.3