aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/device_context.cc67
-rw-r--r--src/device_context.hh8
-rw-r--r--src/layer.cc38
-rw-r--r--src/queue_context.cc93
-rw-r--r--src/queue_context.hh29
-rw-r--r--src/timestamp_pool.cc8
6 files changed, 98 insertions, 145 deletions
diff --git a/src/device_context.cc b/src/device_context.cc
index 2214b71..b149311 100644
--- a/src/device_context.cc
+++ b/src/device_context.cc
@@ -1,7 +1,6 @@
#include "device_context.hh"
#include "queue_context.hh"
-#include <iostream>
#include <utility>
#include <vulkan/vulkan_core.h>
@@ -23,22 +22,6 @@ DeviceContext::~DeviceContext() {
}
}
-void DeviceContext::notify_acquire(const VkSwapchainKHR& swapchain,
- const std::uint32_t& image_index,
- const VkSemaphore& signal_semaphore) {
-
- /*
- std::cerr << "notify acquire for swapchain: " << swapchain << " : "
- << image_index << '\n';
- std::cerr << " signal semaphore: " << signal_semaphore << '\n';
- */
-
- const auto it = this->swapchain_signals.try_emplace(swapchain).first;
-
- // Doesn't matter if it was already there, overwrite it.
- it->second.insert_or_assign(image_index, signal_semaphore);
-}
-
DeviceContext::Clock::Clock(const DeviceContext& context) : device(context) {
this->calibrate();
}
@@ -92,17 +75,14 @@ DeviceContext::Clock::ticks_to_time(const std::uint64_t& ticks) const {
return time_point_t{delta};
}
-const auto debug_log_time2 = [](auto& stream, const auto& diff) {
- using namespace std::chrono;
- const auto ms = duration_cast<milliseconds>(diff);
- const auto us = duration_cast<microseconds>(diff - ms);
- const auto ns = duration_cast<nanoseconds>(diff - ms - us);
- stream << ms << " " << us << " " << ns << '\n';
-};
+void DeviceContext::notify_acquire(const VkSwapchainKHR& swapchain,
+ const std::uint32_t& image_index,
+ const VkSemaphore& signal_semaphore) {
+ const auto it = this->swapchain_signals.try_emplace(swapchain).first;
-const auto debug_log_time = [](const auto& diff) {
- debug_log_time2(std::cerr, diff);
-};
+ // Doesn't matter if it was already there, overwrite it.
+ it->second.insert_or_assign(image_index, signal_semaphore);
+}
void DeviceContext::sleep_in_input() {
// Present hasn't happened yet, we don't know what queue to attack.
@@ -110,26 +90,35 @@ void DeviceContext::sleep_in_input() {
return;
}
- const auto before = std::chrono::steady_clock::now();
+ const auto& frames = this->present_queue->in_flight_frames;
+ // No frame here means we're behind the GPU and do not need to delay.
+ // If anything we should speed up...
+ if (!std::size(frames)) {
+ return;
+ }
+
// If we're here, that means that there might be an outstanding frame that's
// sitting on our present_queue which hasn't yet completed, so we need to
// stall until it's finished.
- const auto& frames = this->present_queue->in_flight_frames;
- if (std::size(frames)) {
- frames.back().submissions.back()->end_handle->get_time_spinlock();
- }
- const auto after = std::chrono::steady_clock::now();
- //debug_log_time(after - before);
-
- // FIXME this should take into account 'cpu_time', which we currently do not...
- // idk if it matters.
+ const auto& last_frame = frames.back();
+ assert(std::size(last_frame.submissions));
+ const auto& last_frame_submission = frames.back().submissions.back();
+ last_frame_submission->end_handle->get_time_spinlock();
+
+ // From our sleep in present implementation, just spinning until
+ // the previous frame has completed did not work well. This was because
+ // there was a delay between presentation and when new work was given
+ // to the GPU. If we stalled the CPU without trying to account for this, we
+ // would get huge frame drops, loss of throughput, and the GPU would even
+ // clock down. So naturally I am concerned about this approach, but it seems
+ // to perform well so far in my own testing and is just beautifully elegant.
}
void DeviceContext::notify_antilag_update(const VkAntiLagDataAMD& data) {
this->antilag_mode = data.mode;
- this->antilag_fps = data.maxFPS;
+ this->antilag_fps = data.maxFPS; // TODO
- // This might not be provided (probably just to set some settings).
+ // This might not be provided (probably just to set some settings?).
if (!data.pPresentationInfo) {
return;
}
diff --git a/src/device_context.hh b/src/device_context.hh
index c73f97f..37817d5 100644
--- a/src/device_context.hh
+++ b/src/device_context.hh
@@ -2,7 +2,6 @@
#define DEVICE_CONTEXT_HH_
#include <chrono>
-#include <deque>
#include <memory>
#include <unordered_map>
@@ -30,6 +29,8 @@ struct DeviceContext final : public Context {
std::unordered_map<VkQueue, std::shared_ptr<QueueContext>> queues;
// We map swapchains to image indexes and their last signalled semaphore.
+ // FIXME: This isn't used right now, it was formerly used to map queue
+ // submissions but it ended up being unnecessary complexity.
using index_semaphores_t = std::unordered_map<std::uint32_t, VkSemaphore>;
std::unordered_map<VkSwapchainKHR, index_semaphores_t> swapchain_signals;
@@ -54,7 +55,6 @@ struct DeviceContext final : public Context {
};
Clock clock;
-
std::uint32_t antilag_fps = 0;
VkAntiLagModeAMD antilag_mode = VK_ANTI_LAG_MODE_DRIVER_CONTROL_AMD;
@@ -75,9 +75,9 @@ struct DeviceContext final : public Context {
const std::uint32_t& image_index,
const VkSemaphore& signal_semaphore);
- //
+ //
void notify_antilag_update(const VkAntiLagDataAMD& data);
-
+
void notify_queue_present(const QueueContext& queue);
};
diff --git a/src/layer.cc b/src/layer.cc
index 12067a0..aea2154 100644
--- a/src/layer.cc
+++ b/src/layer.cc
@@ -1,6 +1,5 @@
#include "layer.hh"
-#include <iostream>
#include <span>
#include <string_view>
#include <unordered_map>
@@ -302,8 +301,6 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
DEVICE_VTABLE_LOAD(BeginCommandBuffer);
DEVICE_VTABLE_LOAD(EndCommandBuffer);
DEVICE_VTABLE_LOAD(ResetCommandBuffer);
- DEVICE_VTABLE_LOAD(CmdDraw);
- DEVICE_VTABLE_LOAD(CmdDrawIndexed);
DEVICE_VTABLE_LOAD(CmdResetQueryPool);
DEVICE_VTABLE_LOAD(GetDeviceQueue2);
DEVICE_VTABLE_LOAD(QueueSubmit2);
@@ -387,8 +384,7 @@ GetDeviceQueue(VkDevice device, std::uint32_t queue_family_index,
device_context->queues.emplace(*queue, ptr);
}
-// Identical logic to gdq so some amount of duplication, we can't assume gdq1 is
-// available apparently, what do I know?
+// Identical logic to gdq1.
static VKAPI_ATTR void VKAPI_CALL GetDeviceQueue2(
VkDevice device, const VkDeviceQueueInfo2* info, VkQueue* queue) {
@@ -454,11 +450,7 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count,
const auto& queue_context = layer_context.get_context(queue);
const auto& vtable = queue_context->device_context.vtable;
- if (!submit_count) { // no-op submit we shouldn't worry about
- return vtable.QueueSubmit(queue, submit_count, submit_infos, fence);
- }
-
- if (!queue_context->should_inject_timestamps()) {
+ if (!submit_count || !queue_context->should_inject_timestamps()) {
return vtable.QueueSubmit(queue, submit_count, submit_infos, fence);
}
@@ -476,17 +468,15 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count,
// alone.
// 2. Semaphores only signal at the end of their work, so we cannot use
// them as a mechanism to know if work has started without doing
- // another dummy submission. This adds complexity and also skews our
- // timestamps slightly.
- // 3. Semaphores can be waited which sounds nice in theory, but in my
- // own testing waiting on semaphores can cause scheduling issues and
- // cause wakeups as late as 1ms from when it was signalled, which is
- // unbelievably bad if we're trying to do frame pacing. This means
- // we are going to have to do a spinlock poll anyway.
- // 4. Guess what info we need? Timestamp information. Guess what
- // supports polling of an availability bit? Timestamp information.
- // Why bother with semaphores at all then? Polling a semaphore might
- // be faster, but the difference appears to be negligible.
+ // another dummy submission. This adds complexity and also might
+ // skew our timestamps slightly as they wouldn't be a part of the
+ // submission which contained those command buffers.
+ // 3. Timestamps support querying if their work has started/ended
+ // as long as we use the vkHostQueryReset extension to reset them
+ // before we consider them queryable. This means we don't need a
+ // 'is it valid to query' timeline semaphore.
+ // 4. The performance impact of using semaphores vs timestamps is
+ negligible.
using cbs_t = std::vector<VkCommandBuffer>;
auto next_submits = std::vector<VkSubmitInfo>{};
@@ -541,11 +531,7 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count,
const auto& queue_context = layer_context.get_context(queue);
const auto& vtable = queue_context->device_context.vtable;
- if (!submit_count) {
- return vtable.QueueSubmit2(queue, submit_count, submit_infos, fence);
- }
-
- if (!queue_context->should_inject_timestamps()) {
+ if (!submit_count || !queue_context->should_inject_timestamps()) {
return vtable.QueueSubmit2(queue, submit_count, submit_infos, fence);
}
diff --git a/src/queue_context.cc b/src/queue_context.cc
index 388019c..1f798de 100644
--- a/src/queue_context.cc
+++ b/src/queue_context.cc
@@ -12,31 +12,29 @@
namespace low_latency {
-static VkCommandPool
-make_command_pool(const DeviceContext& device_context,
- const std::uint32_t& queue_family_index) {
-
- const auto cpci = VkCommandPoolCreateInfo{
- .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
- .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT |
- VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
- .queueFamilyIndex = queue_family_index,
- };
-
- auto command_pool = VkCommandPool{};
- device_context.vtable.CreateCommandPool(device_context.device, &cpci,
- nullptr, &command_pool);
- return command_pool;
-}
-
QueueContext::QueueContext(DeviceContext& device_context, const VkQueue& queue,
const std::uint32_t& queue_family_index)
: device_context(device_context), queue(queue),
- queue_family_index(queue_family_index),
- // Important we make the command pool before the timestamp pool, because
- // it's a dependency.
- command_pool(make_command_pool(device_context, queue_family_index)),
- timestamp_pool(std::make_unique<TimestampPool>(*this)) {}
+ queue_family_index(queue_family_index) {
+
+ // Important we make the command pool before the timestamp pool, because
+ // it's a dependency.
+ this->command_pool = [&]() {
+ const auto cpci = VkCommandPoolCreateInfo{
+ .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+ .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT |
+ VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
+ .queueFamilyIndex = queue_family_index,
+ };
+
+ auto command_pool = VkCommandPool{};
+ device_context.vtable.CreateCommandPool(device_context.device, &cpci,
+ nullptr, &command_pool);
+ return command_pool;
+ }();
+
+ this->timestamp_pool = std::make_unique<TimestampPool>(*this);
+}
QueueContext::~QueueContext() {
@@ -63,27 +61,15 @@ void QueueContext::notify_submit(
std::span{info.pSignalSemaphores, info.signalSemaphoreCount},
std::inserter(signals, std::end(signals)));
- /*
- std::cerr << "submit1 notif for queue " << this->queue << '\n';
- std::cerr << " signals: \n";
- for (const auto& signal : signals) {
- std::cerr << " " << signal << '\n';
- }
- std::cerr << " waits: \n";
- for (const auto& wait : waits) {
- std::cerr << " " << wait << '\n';
- }
- */
-
this->submissions.emplace_back(std::make_unique<Submission>(
std::move(signals), std::move(waits), head_handle, tail_handle, now));
- // TODO HACK
- if (std::size(this->submissions) > 100) {
+ if (std::size(this->submissions) > this->MAX_TRACKED_SUBMISSIONS) {
this->submissions.pop_front();
}
}
+// Identical to notify_submit, but we use VkSubmitInfo2.
void QueueContext::notify_submit(
const VkSubmitInfo2& info,
const std::shared_ptr<TimestampPool::Handle> head_handle,
@@ -103,23 +89,10 @@ void QueueContext::notify_submit(
std::inserter(signals, std::end(signals)),
[](const auto& info) -> auto { return info.semaphore; });
- /*
- std::cerr << "submit2 notif for queue " << this->queue << '\n';
- std::cerr << " signals: \n";
- for (const auto& signal : signals) {
- std::cerr << " " << signal << '\n';
- }
- std::cerr << " waits: \n";
- for (const auto& wait : waits) {
- std::cerr << " " << wait << '\n';
- }
- */
-
this->submissions.emplace_back(std::make_unique<Submission>(
std::move(signals), std::move(waits), head_handle, tail_handle, now));
- // TODO HACK
- if (std::size(this->submissions) > 100) {
+ if (std::size(this->submissions) > this->MAX_TRACKED_SUBMISSIONS) {
this->submissions.pop_front();
}
}
@@ -139,7 +112,6 @@ void QueueContext::drain_submissions_to_frame() {
const auto start_iter = std::begin(this->submissions);
// no op submit?
if (start_iter == std::end(this->submissions)) {
- std::cerr << "ignored no op submit\n";
return;
}
const auto last_iter = std::prev(std::end(this->submissions));
@@ -208,20 +180,14 @@ void QueueContext::drain_frames_to_timings() {
return;
}
- // We used to collect all devices that were pointed to by all potential
- // submissions, put them in a set and then call.calibrate() on each once.
- // This is unnecessary now - we assume all submissions come from the same
- // queue. FIXME: don't assume this.
- auto& device_context = this->device_context;
- auto& clock = device_context.clock;
- clock.calibrate();
+ // Only need to calibrate this device, we don't support multi device anti
+ // lag.
+ this->device_context.clock.calibrate();
while (std::size(this->in_flight_frames)) {
const auto& frame = this->in_flight_frames.front();
- if (!std::size(frame.submissions)) {
- break;
- }
+ assert(std::size(frame.submissions));
const auto& last_submission = frame.submissions.back();
@@ -335,11 +301,8 @@ void QueueContext::sleep_in_present() {
const auto& device = this->device_context;
const auto& vtable = device.vtable;
- // Call this to push all in flight frames into our timings structure,
- // but only if they're completed. So now they are truly *in flight
- // frames*.
+ // After calling this, any remaining frames are truly *in flight*.
this->drain_frames_to_timings();
-
if (!std::size(this->in_flight_frames)) {
return;
}
diff --git a/src/queue_context.hh b/src/queue_context.hh
index 2a3ea39..fbb04e8 100644
--- a/src/queue_context.hh
+++ b/src/queue_context.hh
@@ -16,6 +16,21 @@
namespace low_latency {
class QueueContext final : public Context {
+ private:
+ // The amount of finished frame timing data we keep before eviction.
+ // For now, this value is also the number of data points used in the
+ // calculation of gpu timing information.
+ static constexpr auto MAX_TRACKED_TIMINGS = 50;
+ // The amount of queue submissions we allow tracked per queue before
+ // we give up tracking them. For a queue that is presented to,
+ // these submissions will be constantly moved to Frame structs so
+ // it's not an issue that we only track so many - unless it just
+ // happens that an application makes an unexpectedly large
+ // amount of vkQueueSubmit's per frame. For queues which don't
+ // present, this limit stops them from growing limitlessly in memory
+ // as we may not necessarily manually evict them yet.
+ static constexpr auto MAX_TRACKED_SUBMISSIONS = 50;
+
public:
DeviceContext& device_context;
@@ -27,8 +42,6 @@ class QueueContext final : public Context {
std::unique_ptr<TimestampPool> timestamp_pool;
public:
- static constexpr auto MAX_TRACKED_TIMINGS = 50;
-
// Potentially in flight queue submissions that come from this queue.
struct Submission {
const std::unordered_set<VkSemaphore> signals;
@@ -36,7 +49,7 @@ class QueueContext final : public Context {
const std::shared_ptr<TimestampPool::Handle> start_handle;
const std::shared_ptr<TimestampPool::Handle> end_handle;
-
+
const DeviceContext::Clock::time_point_t enqueued_time;
};
using submission_ptr_t = std::shared_ptr<Submission>;
@@ -50,8 +63,8 @@ class QueueContext final : public Context {
struct Frame {
std::deque<submission_ptr_t> submissions;
- // the point that control flow was returned from VkQueuePresentKHR back to the
- // application.
+ // the point that control flow was returned from VkQueuePresentKHR back
+ // to the application.
DeviceContext::Clock::time_point_t cpu_post_present_time;
};
std::deque<Frame> in_flight_frames;
@@ -67,11 +80,11 @@ class QueueContext final : public Context {
private:
// Drains submissions and promotes them into a single frame object.
void drain_submissions_to_frame();
-
+
// Drains in flight frames and promotes them into a Timing object if they
// have completed.
void drain_frames_to_timings();
-
+
// Antilag 1 equivalent where we sleep after present to reduce queueing.
void sleep_in_present();
@@ -92,7 +105,7 @@ class QueueContext final : public Context {
const DeviceContext::Clock::time_point_t& now);
void notify_present(const VkPresentInfoKHR& info);
-
+
public:
bool should_inject_timestamps() const;
};
diff --git a/src/timestamp_pool.cc b/src/timestamp_pool.cc
index 5149747..e8ef9f5 100644
--- a/src/timestamp_pool.cc
+++ b/src/timestamp_pool.cc
@@ -3,8 +3,8 @@
#include "queue_context.hh"
#include <chrono>
-#include <span>
#include <ranges>
+#include <span>
#include <vulkan/utility/vk_dispatch_table.h>
#include <vulkan/vulkan_core.h>
@@ -25,8 +25,10 @@ TimestampPool::QueryChunk::QueryChunk(const QueueContext& queue_context) {
return qp;
}();
- constexpr auto KEY_RANGE = std::views::iota(0u, QueryChunk::CHUNK_SIZE);
- this->free_indices = std::make_unique<free_indices_t>(std::from_range, KEY_RANGE);
+ this->free_indices = []() {
+ constexpr auto KEYS = std::views::iota(0u, QueryChunk::CHUNK_SIZE);
+ return std::make_unique<free_indices_t>(std::from_range, KEYS);
+ }();
this->command_buffers = [&, this]() -> auto {
auto cbs = std::make_unique<std::vector<VkCommandBuffer>>(CHUNK_SIZE);