From 56706244924987852e5ec610941bce8258ae647b Mon Sep 17 00:00:00 2001 From: Nicolas James Date: Mon, 23 Feb 2026 15:47:39 +1100 Subject: Implement AntiLag2 --- src/device_context.cc | 67 +++++++++++++++++++++++++++++++++++++++++++++++++-- src/device_context.hh | 16 ++++++++++++ src/layer.cc | 23 ++++++------------ src/queue_context.cc | 42 +++++++++++++++++++++++--------- src/queue_context.hh | 17 +++++++------ src/timestamp_pool.cc | 6 ++--- 6 files changed, 131 insertions(+), 40 deletions(-) diff --git a/src/device_context.cc b/src/device_context.cc index a8e0347..2214b71 100644 --- a/src/device_context.cc +++ b/src/device_context.cc @@ -1,7 +1,9 @@ #include "device_context.hh" +#include "queue_context.hh" #include #include +#include namespace low_latency { @@ -13,6 +15,7 @@ DeviceContext::DeviceContext(InstanceContext& parent_instance, device(device), vtable(std::move(vtable)), clock(*this) {} DeviceContext::~DeviceContext() { + this->present_queue.reset(); // We will let the destructor handle clearing here, but they should be // unique by now (ie, removed from the layer's context map). for (const auto& [queue, queue_context] : this->queues) { @@ -24,9 +27,11 @@ void DeviceContext::notify_acquire(const VkSwapchainKHR& swapchain, const std::uint32_t& image_index, const VkSemaphore& signal_semaphore) { + /* std::cerr << "notify acquire for swapchain: " << swapchain << " : " << image_index << '\n'; std::cerr << " signal semaphore: " << signal_semaphore << '\n'; + */ const auto it = this->swapchain_signals.try_emplace(swapchain).first; @@ -63,7 +68,7 @@ DeviceContext::Clock::time_point_t DeviceContext::Clock::ticks_to_time(const std::uint64_t& ticks) const { const auto& pd = device.physical_device.properties; const auto ns_tick = static_cast(pd->limits.timestampPeriod); - + const auto diff = [&]() -> auto { auto a = this->device_ticks; auto b = ticks; @@ -76,7 +81,7 @@ DeviceContext::Clock::ticks_to_time(const std::uint64_t& ticks) const { const auto signed_abs_diff = static_cast(abs_diff); return is_negative ? -signed_abs_diff : signed_abs_diff; }(); - + // This will have issues because std::chrono::steady_clock::now(), which // we use for cpu time, may not be on the same time domain what was returned // by GetCalibratedTimestamps. It would be more robust to use the posix @@ -87,4 +92,62 @@ DeviceContext::Clock::ticks_to_time(const std::uint64_t& ticks) const { return time_point_t{delta}; } +const auto debug_log_time2 = [](auto& stream, const auto& diff) { + using namespace std::chrono; + const auto ms = duration_cast(diff); + const auto us = duration_cast(diff - ms); + const auto ns = duration_cast(diff - ms - us); + stream << ms << " " << us << " " << ns << '\n'; +}; + +const auto debug_log_time = [](const auto& diff) { + debug_log_time2(std::cerr, diff); +}; + +void DeviceContext::sleep_in_input() { + // Present hasn't happened yet, we don't know what queue to attack. + if (!this->present_queue) { + return; + } + + const auto before = std::chrono::steady_clock::now(); + // If we're here, that means that there might be an outstanding frame that's + // sitting on our present_queue which hasn't yet completed, so we need to + // stall until it's finished. + const auto& frames = this->present_queue->in_flight_frames; + if (std::size(frames)) { + frames.back().submissions.back()->end_handle->get_time_spinlock(); + } + const auto after = std::chrono::steady_clock::now(); + //debug_log_time(after - before); + + // FIXME this should take into account 'cpu_time', which we currently do not... + // idk if it matters. +} + +void DeviceContext::notify_antilag_update(const VkAntiLagDataAMD& data) { + this->antilag_mode = data.mode; + this->antilag_fps = data.maxFPS; + + // This might not be provided (probably just to set some settings). + if (!data.pPresentationInfo) { + return; + } + + const auto& presentation_info = *data.pPresentationInfo; + // Only care about the input stage for now. + if (presentation_info.stage != VK_ANTI_LAG_STAGE_INPUT_AMD) { + return; + } + + if (this->antilag_mode == VK_ANTI_LAG_MODE_ON_AMD) { + this->sleep_in_input(); + } +} + +void DeviceContext::notify_queue_present(const QueueContext& queue) { + assert(this->queues.contains(queue.queue)); + this->present_queue = this->queues[queue.queue]; +} + } // namespace low_latency \ No newline at end of file diff --git a/src/device_context.hh b/src/device_context.hh index 310b8a7..c73f97f 100644 --- a/src/device_context.hh +++ b/src/device_context.hh @@ -2,6 +2,7 @@ #define DEVICE_CONTEXT_HH_ #include +#include #include #include @@ -53,6 +54,16 @@ struct DeviceContext final : public Context { }; Clock clock; + + std::uint32_t antilag_fps = 0; + VkAntiLagModeAMD antilag_mode = VK_ANTI_LAG_MODE_DRIVER_CONTROL_AMD; + + // The queue used in the last present. + std::shared_ptr present_queue; + + private: + void sleep_in_input(); + public: DeviceContext(InstanceContext& parent_instance, PhysicalDeviceContext& parent_physical, @@ -63,6 +74,11 @@ struct DeviceContext final : public Context { void notify_acquire(const VkSwapchainKHR& swapchain, const std::uint32_t& image_index, const VkSemaphore& signal_semaphore); + + // + void notify_antilag_update(const VkAntiLagDataAMD& data); + + void notify_queue_present(const QueueContext& queue); }; }; // namespace low_latency diff --git a/src/layer.cc b/src/layer.cc index b5287f8..12067a0 100644 --- a/src/layer.cc +++ b/src/layer.cc @@ -56,7 +56,8 @@ static const T* find_next(const void* const head, } template -static const T* find_link(const void* head, const VkStructureType& stype) { +static const T* find_link(const void* const head, + const VkStructureType& stype) { for (auto info = find_next(head, stype); info; info = find_next(info->pNext, stype)) { @@ -610,11 +611,7 @@ vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* present_info) { return res; } - if (present_info) { // might not be needed - queue_context->notify_present(*present_info); - } - - queue_context->sleep_in_present(); + queue_context->notify_present(*present_info); return VK_SUCCESS; } @@ -633,13 +630,13 @@ static VKAPI_ATTR VkResult VKAPI_CALL EnumerateDeviceExtensionProperties( physical_device, pLayerName, pPropertyCount, pProperties); } + auto& count = *pPropertyCount; // !pProperties means they're querying how much space they need. if (!pProperties) { - *pPropertyCount = 1; + count = 1; return VK_SUCCESS; } - auto& count = *pPropertyCount; // Defensive - they gave us zero space to work with. if (!count) { return VK_INCOMPLETE; @@ -678,14 +675,8 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2KHR( static VKAPI_ATTR void VKAPI_CALL AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD* pData) { - std::cerr << "low_latency::AntiLagUpdateAMD\n"; - std::cerr << " maxFPS: " << pData->maxFPS << '\n'; - std::cerr << " mode: " << pData->mode << '\n'; - std::cerr << " pPresentInfo: " << pData->pPresentationInfo->frameIndex - << '\n'; - std::cerr << " frameIndex: " << pData->pPresentationInfo->frameIndex - << '\n'; - std::cerr << " stage: " << pData->pPresentationInfo->stage << '\n'; + const auto device_context = layer_context.get_context(device); + device_context->notify_antilag_update(*pData); } } // namespace low_latency diff --git a/src/queue_context.cc b/src/queue_context.cc index d20cc79..388019c 100644 --- a/src/queue_context.cc +++ b/src/queue_context.cc @@ -8,6 +8,7 @@ #include #include #include +#include namespace low_latency { @@ -62,6 +63,7 @@ void QueueContext::notify_submit( std::span{info.pSignalSemaphores, info.signalSemaphoreCount}, std::inserter(signals, std::end(signals))); + /* std::cerr << "submit1 notif for queue " << this->queue << '\n'; std::cerr << " signals: \n"; for (const auto& signal : signals) { @@ -71,6 +73,7 @@ void QueueContext::notify_submit( for (const auto& wait : waits) { std::cerr << " " << wait << '\n'; } + */ this->submissions.emplace_back(std::make_unique( std::move(signals), std::move(waits), head_handle, tail_handle, now)); @@ -100,6 +103,7 @@ void QueueContext::notify_submit( std::inserter(signals, std::end(signals)), [](const auto& info) -> auto { return info.semaphore; }); + /* std::cerr << "submit2 notif for queue " << this->queue << '\n'; std::cerr << " signals: \n"; for (const auto& signal : signals) { @@ -109,6 +113,7 @@ void QueueContext::notify_submit( for (const auto& wait : waits) { std::cerr << " " << wait << '\n'; } + */ this->submissions.emplace_back(std::make_unique( std::move(signals), std::move(waits), head_handle, tail_handle, now)); @@ -119,7 +124,7 @@ void QueueContext::notify_submit( } } -void QueueContext::notify_present(const VkPresentInfoKHR& info) { +void QueueContext::drain_submissions_to_frame() { // We are going to assume that all queue submissions before and on the same // queue contribute to the frame. @@ -139,9 +144,6 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) { } const auto last_iter = std::prev(std::end(this->submissions)); - (*start_iter)->debug += "first_during_present "; - (*last_iter)->debug += "last_during_present "; - // The last submission is either in flight, already processed, or we // just happen to be the first frame and we can just set it to our start // with little consequence. @@ -173,6 +175,22 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) { this->submissions.clear(); } +void QueueContext::notify_present(const VkPresentInfoKHR& info) { + this->drain_submissions_to_frame(); + this->drain_frames_to_timings(); + + // Call up to notify the device now that we're done with this frame. + // We have to do this because antilag 2 data is sent to the device, not + // any particular queue. + this->device_context.notify_queue_present(*this); + + // If antilag is on, the sleep will occur in notify_antilag_update at the + // device context. + if (this->device_context.antilag_mode != VK_ANTI_LAG_MODE_ON_AMD) { + this->sleep_in_present(); + } +} + const auto debug_log_time2 = [](auto& stream, const auto& diff) { using namespace std::chrono; const auto ms = duration_cast(diff); @@ -185,7 +203,7 @@ const auto debug_log_time = [](const auto& diff) { debug_log_time2(std::cerr, diff); }; -void QueueContext::process_frames() { +void QueueContext::drain_frames_to_timings() { if (!std::size(this->in_flight_frames)) { return; } @@ -201,9 +219,9 @@ void QueueContext::process_frames() { while (std::size(this->in_flight_frames)) { const auto& frame = this->in_flight_frames.front(); - // There should be at least one submission, we guarantee it in - // notify_present. - assert(std::size(frame.submissions)); + if (!std::size(frame.submissions)) { + break; + } const auto& last_submission = frame.submissions.back(); @@ -320,7 +338,7 @@ void QueueContext::sleep_in_present() { // Call this to push all in flight frames into our timings structure, // but only if they're completed. So now they are truly *in flight // frames*. - this->process_frames(); + this->drain_frames_to_timings(); if (!std::size(this->in_flight_frames)) { return; @@ -335,9 +353,9 @@ void QueueContext::sleep_in_present() { return first_submission->start_handle->get_time_spinlock(); }(); - // Process frames because as stated above, we might have multiple frames - // now completed. - this->process_frames(); + // Drain frames again because as stated above, we might have multiple frames + // now completed after our wait spinlock. + this->drain_frames_to_timings(); // Check the size again because the frame we want to target may have already // completed when we called process_frames(). diff --git a/src/queue_context.hh b/src/queue_context.hh index 67b9c5d..2a3ea39 100644 --- a/src/queue_context.hh +++ b/src/queue_context.hh @@ -26,7 +26,7 @@ class QueueContext final : public Context { std::unique_ptr timestamp_pool; - private: + public: static constexpr auto MAX_TRACKED_TIMINGS = 50; // Potentially in flight queue submissions that come from this queue. @@ -38,8 +38,6 @@ class QueueContext final : public Context { const std::shared_ptr end_handle; const DeviceContext::Clock::time_point_t enqueued_time; - - std::string debug; }; using submission_ptr_t = std::shared_ptr; std::deque submissions; @@ -67,7 +65,15 @@ class QueueContext final : public Context { std::deque> timings; private: - void process_frames(); + // Drains submissions and promotes them into a single frame object. + void drain_submissions_to_frame(); + + // Drains in flight frames and promotes them into a Timing object if they + // have completed. + void drain_frames_to_timings(); + + // Antilag 1 equivalent where we sleep after present to reduce queueing. + void sleep_in_present(); public: QueueContext(DeviceContext& device_context, const VkQueue& queue, @@ -86,9 +92,6 @@ class QueueContext final : public Context { const DeviceContext::Clock::time_point_t& now); void notify_present(const VkPresentInfoKHR& info); - - public: - void sleep_in_present(); public: bool should_inject_timestamps() const; diff --git a/src/timestamp_pool.cc b/src/timestamp_pool.cc index e482654..5149747 100644 --- a/src/timestamp_pool.cc +++ b/src/timestamp_pool.cc @@ -3,6 +3,7 @@ #include "queue_context.hh" #include +#include #include #include #include @@ -24,9 +25,8 @@ TimestampPool::QueryChunk::QueryChunk(const QueueContext& queue_context) { return qp; }(); - constexpr auto key_range = std::views::iota(0u, QueryChunk::CHUNK_SIZE); - this->free_indices = std::make_unique(std::begin(key_range), - std::end(key_range)); + constexpr auto KEY_RANGE = std::views::iota(0u, QueryChunk::CHUNK_SIZE); + this->free_indices = std::make_unique(std::from_range, KEY_RANGE); this->command_buffers = [&, this]() -> auto { auto cbs = std::make_unique>(CHUNK_SIZE); -- cgit v1.2.3