aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/device_context.cc67
-rw-r--r--src/device_context.hh16
-rw-r--r--src/layer.cc23
-rw-r--r--src/queue_context.cc42
-rw-r--r--src/queue_context.hh17
-rw-r--r--src/timestamp_pool.cc6
6 files changed, 131 insertions, 40 deletions
diff --git a/src/device_context.cc b/src/device_context.cc
index a8e0347..2214b71 100644
--- a/src/device_context.cc
+++ b/src/device_context.cc
@@ -1,7 +1,9 @@
#include "device_context.hh"
+#include "queue_context.hh"
#include <iostream>
#include <utility>
+#include <vulkan/vulkan_core.h>
namespace low_latency {
@@ -13,6 +15,7 @@ DeviceContext::DeviceContext(InstanceContext& parent_instance,
device(device), vtable(std::move(vtable)), clock(*this) {}
DeviceContext::~DeviceContext() {
+ this->present_queue.reset();
// We will let the destructor handle clearing here, but they should be
// unique by now (ie, removed from the layer's context map).
for (const auto& [queue, queue_context] : this->queues) {
@@ -24,9 +27,11 @@ void DeviceContext::notify_acquire(const VkSwapchainKHR& swapchain,
const std::uint32_t& image_index,
const VkSemaphore& signal_semaphore) {
+ /*
std::cerr << "notify acquire for swapchain: " << swapchain << " : "
<< image_index << '\n';
std::cerr << " signal semaphore: " << signal_semaphore << '\n';
+ */
const auto it = this->swapchain_signals.try_emplace(swapchain).first;
@@ -63,7 +68,7 @@ DeviceContext::Clock::time_point_t
DeviceContext::Clock::ticks_to_time(const std::uint64_t& ticks) const {
const auto& pd = device.physical_device.properties;
const auto ns_tick = static_cast<double>(pd->limits.timestampPeriod);
-
+
const auto diff = [&]() -> auto {
auto a = this->device_ticks;
auto b = ticks;
@@ -76,7 +81,7 @@ DeviceContext::Clock::ticks_to_time(const std::uint64_t& ticks) const {
const auto signed_abs_diff = static_cast<std::int64_t>(abs_diff);
return is_negative ? -signed_abs_diff : signed_abs_diff;
}();
-
+
// This will have issues because std::chrono::steady_clock::now(), which
// we use for cpu time, may not be in the same time domain as what was returned
// by GetCalibratedTimestamps. It would be more robust to use the posix
@@ -87,4 +92,62 @@ DeviceContext::Clock::ticks_to_time(const std::uint64_t& ticks) const {
return time_point_t{delta};
}
+const auto debug_log_time2 = [](auto& stream, const auto& diff) {
+ using namespace std::chrono;
+ const auto ms = duration_cast<milliseconds>(diff);
+ const auto us = duration_cast<microseconds>(diff - ms);
+ const auto ns = duration_cast<nanoseconds>(diff - ms - us);
+ stream << ms << " " << us << " " << ns << '\n';
+};
+
+const auto debug_log_time = [](const auto& diff) {
+ debug_log_time2(std::cerr, diff);
+};
+
+void DeviceContext::sleep_in_input() {
+ // Present hasn't happened yet, we don't know what queue to attack.
+ if (!this->present_queue) {
+ return;
+ }
+
+ const auto before = std::chrono::steady_clock::now();
+ // If we're here, that means that there might be an outstanding frame that's
+ // sitting on our present_queue which hasn't yet completed, so we need to
+ // stall until it's finished.
+ const auto& frames = this->present_queue->in_flight_frames;
+ if (std::size(frames)) {
+ frames.back().submissions.back()->end_handle->get_time_spinlock();
+ }
+ const auto after = std::chrono::steady_clock::now();
+ //debug_log_time(after - before);
+
+  // FIXME: this should also take 'cpu_time' into account, which we currently
+  // do not; it is unclear whether the omission matters in practice.
+}
+
+void DeviceContext::notify_antilag_update(const VkAntiLagDataAMD& data) {
+ this->antilag_mode = data.mode;
+ this->antilag_fps = data.maxFPS;
+
+ // This might not be provided (probably just to set some settings).
+ if (!data.pPresentationInfo) {
+ return;
+ }
+
+ const auto& presentation_info = *data.pPresentationInfo;
+ // Only care about the input stage for now.
+ if (presentation_info.stage != VK_ANTI_LAG_STAGE_INPUT_AMD) {
+ return;
+ }
+
+ if (this->antilag_mode == VK_ANTI_LAG_MODE_ON_AMD) {
+ this->sleep_in_input();
+ }
+}
+
+void DeviceContext::notify_queue_present(const QueueContext& queue) {
+ assert(this->queues.contains(queue.queue));
+ this->present_queue = this->queues[queue.queue];
+}
+
} // namespace low_latency \ No newline at end of file
diff --git a/src/device_context.hh b/src/device_context.hh
index 310b8a7..c73f97f 100644
--- a/src/device_context.hh
+++ b/src/device_context.hh
@@ -2,6 +2,7 @@
#define DEVICE_CONTEXT_HH_
#include <chrono>
+#include <deque>
#include <memory>
#include <unordered_map>
@@ -53,6 +54,16 @@ struct DeviceContext final : public Context {
};
Clock clock;
+
+ std::uint32_t antilag_fps = 0;
+ VkAntiLagModeAMD antilag_mode = VK_ANTI_LAG_MODE_DRIVER_CONTROL_AMD;
+
+ // The queue used in the last present.
+ std::shared_ptr<QueueContext> present_queue;
+
+ private:
+ void sleep_in_input();
+
public:
DeviceContext(InstanceContext& parent_instance,
PhysicalDeviceContext& parent_physical,
@@ -63,6 +74,11 @@ struct DeviceContext final : public Context {
void notify_acquire(const VkSwapchainKHR& swapchain,
const std::uint32_t& image_index,
const VkSemaphore& signal_semaphore);
+
+  // Handles VkAntiLagDataAMD forwarded from the layer's AntiLagUpdateAMD hook.
+ void notify_antilag_update(const VkAntiLagDataAMD& data);
+
+ void notify_queue_present(const QueueContext& queue);
};
}; // namespace low_latency
diff --git a/src/layer.cc b/src/layer.cc
index b5287f8..12067a0 100644
--- a/src/layer.cc
+++ b/src/layer.cc
@@ -56,7 +56,8 @@ static const T* find_next(const void* const head,
}
template <typename T>
-static const T* find_link(const void* head, const VkStructureType& stype) {
+static const T* find_link(const void* const head,
+ const VkStructureType& stype) {
for (auto info = find_next<T>(head, stype); info;
info = find_next<T>(info->pNext, stype)) {
@@ -610,11 +611,7 @@ vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* present_info) {
return res;
}
- if (present_info) { // might not be needed
- queue_context->notify_present(*present_info);
- }
-
- queue_context->sleep_in_present();
+ queue_context->notify_present(*present_info);
return VK_SUCCESS;
}
@@ -633,13 +630,13 @@ static VKAPI_ATTR VkResult VKAPI_CALL EnumerateDeviceExtensionProperties(
physical_device, pLayerName, pPropertyCount, pProperties);
}
+ auto& count = *pPropertyCount;
// !pProperties means they're querying how much space they need.
if (!pProperties) {
- *pPropertyCount = 1;
+ count = 1;
return VK_SUCCESS;
}
- auto& count = *pPropertyCount;
// Defensive - they gave us zero space to work with.
if (!count) {
return VK_INCOMPLETE;
@@ -678,14 +675,8 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2KHR(
static VKAPI_ATTR void VKAPI_CALL
AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD* pData) {
- std::cerr << "low_latency::AntiLagUpdateAMD\n";
- std::cerr << " maxFPS: " << pData->maxFPS << '\n';
- std::cerr << " mode: " << pData->mode << '\n';
- std::cerr << " pPresentInfo: " << pData->pPresentationInfo->frameIndex
- << '\n';
- std::cerr << " frameIndex: " << pData->pPresentationInfo->frameIndex
- << '\n';
- std::cerr << " stage: " << pData->pPresentationInfo->stage << '\n';
+ const auto device_context = layer_context.get_context(device);
+ device_context->notify_antilag_update(*pData);
}
} // namespace low_latency
diff --git a/src/queue_context.cc b/src/queue_context.cc
index d20cc79..388019c 100644
--- a/src/queue_context.cc
+++ b/src/queue_context.cc
@@ -8,6 +8,7 @@
#include <iostream>
#include <ranges>
#include <span>
+#include <vulkan/vulkan_core.h>
namespace low_latency {
@@ -62,6 +63,7 @@ void QueueContext::notify_submit(
std::span{info.pSignalSemaphores, info.signalSemaphoreCount},
std::inserter(signals, std::end(signals)));
+ /*
std::cerr << "submit1 notif for queue " << this->queue << '\n';
std::cerr << " signals: \n";
for (const auto& signal : signals) {
@@ -71,6 +73,7 @@ void QueueContext::notify_submit(
for (const auto& wait : waits) {
std::cerr << " " << wait << '\n';
}
+ */
this->submissions.emplace_back(std::make_unique<Submission>(
std::move(signals), std::move(waits), head_handle, tail_handle, now));
@@ -100,6 +103,7 @@ void QueueContext::notify_submit(
std::inserter(signals, std::end(signals)),
[](const auto& info) -> auto { return info.semaphore; });
+ /*
std::cerr << "submit2 notif for queue " << this->queue << '\n';
std::cerr << " signals: \n";
for (const auto& signal : signals) {
@@ -109,6 +113,7 @@ void QueueContext::notify_submit(
for (const auto& wait : waits) {
std::cerr << " " << wait << '\n';
}
+ */
this->submissions.emplace_back(std::make_unique<Submission>(
std::move(signals), std::move(waits), head_handle, tail_handle, now));
@@ -119,7 +124,7 @@ void QueueContext::notify_submit(
}
}
-void QueueContext::notify_present(const VkPresentInfoKHR& info) {
+void QueueContext::drain_submissions_to_frame() {
// We are going to assume that all queue submissions before and on the same
// queue contribute to the frame.
@@ -139,9 +144,6 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) {
}
const auto last_iter = std::prev(std::end(this->submissions));
- (*start_iter)->debug += "first_during_present ";
- (*last_iter)->debug += "last_during_present ";
-
// The last submission is either in flight, already processed, or we
// just happen to be the first frame and we can just set it to our start
// with little consequence.
@@ -173,6 +175,22 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) {
this->submissions.clear();
}
+void QueueContext::notify_present(const VkPresentInfoKHR& info) {
+ this->drain_submissions_to_frame();
+ this->drain_frames_to_timings();
+
+ // Call up to notify the device now that we're done with this frame.
+ // We have to do this because antilag 2 data is sent to the device, not
+ // any particular queue.
+ this->device_context.notify_queue_present(*this);
+
+ // If antilag is on, the sleep will occur in notify_antilag_update at the
+ // device context.
+ if (this->device_context.antilag_mode != VK_ANTI_LAG_MODE_ON_AMD) {
+ this->sleep_in_present();
+ }
+}
+
const auto debug_log_time2 = [](auto& stream, const auto& diff) {
using namespace std::chrono;
const auto ms = duration_cast<milliseconds>(diff);
@@ -185,7 +203,7 @@ const auto debug_log_time = [](const auto& diff) {
debug_log_time2(std::cerr, diff);
};
-void QueueContext::process_frames() {
+void QueueContext::drain_frames_to_timings() {
if (!std::size(this->in_flight_frames)) {
return;
}
@@ -201,9 +219,9 @@ void QueueContext::process_frames() {
while (std::size(this->in_flight_frames)) {
const auto& frame = this->in_flight_frames.front();
- // There should be at least one submission, we guarantee it in
- // notify_present.
- assert(std::size(frame.submissions));
+ if (!std::size(frame.submissions)) {
+ break;
+ }
const auto& last_submission = frame.submissions.back();
@@ -320,7 +338,7 @@ void QueueContext::sleep_in_present() {
// Call this to push all in flight frames into our timings structure,
// but only if they're completed. So now they are truly *in flight
// frames*.
- this->process_frames();
+ this->drain_frames_to_timings();
if (!std::size(this->in_flight_frames)) {
return;
@@ -335,9 +353,9 @@ void QueueContext::sleep_in_present() {
return first_submission->start_handle->get_time_spinlock();
}();
- // Process frames because as stated above, we might have multiple frames
- // now completed.
- this->process_frames();
+ // Drain frames again because as stated above, we might have multiple frames
+ // now completed after our wait spinlock.
+ this->drain_frames_to_timings();
// Check the size again because the frame we want to target may have already
// completed when we called process_frames().
diff --git a/src/queue_context.hh b/src/queue_context.hh
index 67b9c5d..2a3ea39 100644
--- a/src/queue_context.hh
+++ b/src/queue_context.hh
@@ -26,7 +26,7 @@ class QueueContext final : public Context {
std::unique_ptr<TimestampPool> timestamp_pool;
- private:
+ public:
static constexpr auto MAX_TRACKED_TIMINGS = 50;
// Potentially in flight queue submissions that come from this queue.
@@ -38,8 +38,6 @@ class QueueContext final : public Context {
const std::shared_ptr<TimestampPool::Handle> end_handle;
const DeviceContext::Clock::time_point_t enqueued_time;
-
- std::string debug;
};
using submission_ptr_t = std::shared_ptr<Submission>;
std::deque<submission_ptr_t> submissions;
@@ -67,7 +65,15 @@ class QueueContext final : public Context {
std::deque<std::unique_ptr<Timing>> timings;
private:
- void process_frames();
+ // Drains submissions and promotes them into a single frame object.
+ void drain_submissions_to_frame();
+
+ // Drains in flight frames and promotes them into a Timing object if they
+ // have completed.
+ void drain_frames_to_timings();
+
+ // Antilag 1 equivalent where we sleep after present to reduce queueing.
+ void sleep_in_present();
public:
QueueContext(DeviceContext& device_context, const VkQueue& queue,
@@ -86,9 +92,6 @@ class QueueContext final : public Context {
const DeviceContext::Clock::time_point_t& now);
void notify_present(const VkPresentInfoKHR& info);
-
- public:
- void sleep_in_present();
public:
bool should_inject_timestamps() const;
diff --git a/src/timestamp_pool.cc b/src/timestamp_pool.cc
index e482654..5149747 100644
--- a/src/timestamp_pool.cc
+++ b/src/timestamp_pool.cc
@@ -3,6 +3,7 @@
#include "queue_context.hh"
#include <chrono>
+#include <span>
#include <ranges>
#include <vulkan/utility/vk_dispatch_table.h>
#include <vulkan/vulkan_core.h>
@@ -24,9 +25,8 @@ TimestampPool::QueryChunk::QueryChunk(const QueueContext& queue_context) {
return qp;
}();
- constexpr auto key_range = std::views::iota(0u, QueryChunk::CHUNK_SIZE);
- this->free_indices = std::make_unique<free_indices_t>(std::begin(key_range),
- std::end(key_range));
+ constexpr auto KEY_RANGE = std::views::iota(0u, QueryChunk::CHUNK_SIZE);
+ this->free_indices = std::make_unique<free_indices_t>(std::from_range, KEY_RANGE);
this->command_buffers = [&, this]() -> auto {
auto cbs = std::make_unique<std::vector<VkCommandBuffer>>(CHUNK_SIZE);