5 files changed, 258 insertions, 362 deletions
diff --git a/src/device_context.cc b/src/device_context.cc
index f849df1..59d818e 100644
--- a/src/device_context.cc
+++ b/src/device_context.cc
@@ -2,6 +2,7 @@
 #include "queue_context.hh"
 
 #include <utility>
+#include <iostream>
 
 namespace low_latency {
 
@@ -24,6 +25,9 @@ DeviceContext::~DeviceContext() {
 void DeviceContext::notify_acquire(const VkSwapchainKHR& swapchain,
                                    const std::uint32_t& image_index,
                                    const VkSemaphore& signal_semaphore) {
+    
+    std::cerr << "notify acquire for swapchain: " << swapchain << " : " << image_index << '\n';
+    std::cerr << "    signal semaphore: " << signal_semaphore << '\n';
 
     const auto it = this->swapchain_signals.try_emplace(swapchain).first;
 
@@ -57,6 +61,7 @@ void DeviceContext::Clock::calibrate() {
     clock_gettime(CLOCK_MONOTONIC, &tv);
     return tv.tv_nsec + tv.tv_sec*1000000000ull;
     */
+
     const auto steady_before = std::chrono::steady_clock::now();
     device.vtable.GetCalibratedTimestampsKHR(device.device, 2, std::data(infos),
                                              &calibrated_result.device,
diff --git a/src/device_context.hh b/src/device_context.hh
index c08cec2..8a86cfb 100644
--- a/src/device_context.hh
+++ b/src/device_context.hh
@@ -36,7 +36,8 @@ struct DeviceContext final : public Context {
 
     struct Clock {
       public:
-        using time_point_t = std::chrono::steady_clock::time_point;
+        using time_point_t = std::chrono::time_point<std::chrono::steady_clock,
+                                                     std::chrono::nanoseconds>;
         const DeviceContext& device;
 
       public:
diff --git a/src/layer.cc b/src/layer.cc
index 1b1d9e7..f9917f6 100644
--- a/src/layer.cc
+++ b/src/layer.cc
@@ -1,5 +1,6 @@
 #include "layer.hh"
 
+#include <iostream>
 #include <memory>
 #include <span>
 #include <string_view>
@@ -278,7 +279,7 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
 
         return result;
     }
-    
+
 #define DEVICE_VTABLE_LOAD(name)                                               \
     .name = reinterpret_cast<PFN_vk##name>(gdpa(*pDevice, "vk" #name))
     auto vtable = VkuDeviceDispatchTable{
@@ -457,23 +458,19 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count,
     }
 
     // We have to avoid casting away the const* of the passed VkSubmitInfos.
-    // We wrap every single submission with *two* extra VkSubmitInfos to
-    // accomplish this. The first executes a command buffer that 
+    // So we end up copying a lot of stuff and wrapping them in unique_ptrs
+    // so their position in memory is stable.
 
     using cb_vect = std::vector<VkCommandBuffer>;
     using tssi_ptr_t = std::unique_ptr<VkTimelineSemaphoreSubmitInfo>;
     auto next_submits = std::vector<VkSubmitInfo>{};
     auto next_cbs = std::vector<std::unique_ptr<cb_vect>>{};
-    auto next_signals = std::vector<std::unique_ptr<std::uint64_t>>{};
-    auto next_tssis = std::vector<tssi_ptr_t>{};
     auto handles = std::vector<std::shared_ptr<TimestampPool::Handle>>{};
 
     for (const auto& submit_info : std::span{submit_infos, submit_count}) {
         const auto head_handle = queue_context->timestamp_pool->acquire();
         const auto tail_handle = queue_context->timestamp_pool->acquire();
 
-        // Head is special as we need to inject a CB into a copy of
-        // their command buffers that records the time the waits completed.
         next_cbs.emplace_back([&]() -> auto {
             auto cbs = std::make_unique<std::vector<VkCommandBuffer>>();
             head_handle->setup_command_buffers(*tail_handle, *queue_context);
@@ -488,42 +485,13 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count,
         next_submits.back().pCommandBuffers = std::data(*next_cbs.back());
         next_submits.back().commandBufferCount = std::size(*next_cbs.back());
 
-        const auto next_signal = 1 + queue_context->semaphore_sequence++;
-
-        next_signals.push_back(std::make_unique<std::uint64_t>(next_signal));
-
-        next_tssis.push_back(std::make_unique<VkTimelineSemaphoreSubmitInfo>(
-            VkTimelineSemaphoreSubmitInfo{
-                .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR,
-                .signalSemaphoreValueCount = 1,
-                .pSignalSemaphoreValues = next_signals.back().get(),
-            }));
-        next_submits.push_back(VkSubmitInfo{
-            .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
-            .pNext = next_tssis.back().get(),
-            .commandBufferCount = 1,
-            .pCommandBuffers = &tail_handle->command_buffer,
-            .signalSemaphoreCount = 1,
-            .pSignalSemaphores = &queue_context->semaphore,
-        });
-
-        queue_context->notify_submit(submit_info, next_signal, head_handle,
-                                     tail_handle);
-
+        queue_context->notify_submit(submit_info, head_handle, tail_handle);
         handles.push_back(head_handle);
         handles.push_back(tail_handle);
     }
 
-    if (const auto res = vtable.QueueSubmit(queue, std::size(next_submits),
-                                            std::data(next_submits), fence);
-        res != VK_SUCCESS) {
-
-        return res;
-    }
-
-    // ?!?
-
-    return VK_SUCCESS;
+    return vtable.QueueSubmit(queue, std::size(next_submits),
+                              std::data(next_submits), fence);
 }
 
 // The logic for this function is identical to vkSubmitInfo.
@@ -531,71 +499,52 @@ static VKAPI_ATTR VkResult VKAPI_CALL
 vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count,
                const VkSubmitInfo2* submit_infos, VkFence fence) {
 
-    const auto queue_context = layer_context.get_context(queue);
+    const auto& queue_context = layer_context.get_context(queue);
     const auto& vtable = queue_context->device_context.vtable;
 
-    // TODO
-    if (!submit_count || true) {
+    if (!submit_count) {
         return vtable.QueueSubmit2(queue, submit_count, submit_infos, fence);
     }
 
-    /*
-    auto timestamp_handle = queue_context->timestamp_pool->acquire();
-    timestamp_handle->setup_command_buffers(vtable);
-    const auto& [head_cb, tail_cb] = timestamp_handle->command_buffers;
-
-    const auto next_command_buffers = [&]() -> auto {
-        auto next_command_buffers = std::vector<VkCommandBufferSubmitInfo>{};
-        next_command_buffers.push_back(VkCommandBufferSubmitInfo{
-            .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
-            .commandBuffer = head_cb,
-        });
-        std::ranges::copy_n(submit_infos[0].pCommandBufferInfos,
-                            submit_infos[0].commandBufferInfoCount,
-                            std::back_inserter(next_command_buffers));
-        return next_command_buffers;
-    }();
+    using cb_vect_t = std::vector<VkCommandBufferSubmitInfo>;
+    auto next_submits = std::vector<VkSubmitInfo2>{};
+    auto next_cbs = std::vector<std::unique_ptr<cb_vect_t>>{};
+    auto handles = std::vector<std::shared_ptr<TimestampPool::Handle>>{};
 
-    auto next_submit_infos = std::vector<VkSubmitInfo2>();
-    std::ranges::copy_n(submit_infos, submit_count,
-                        std::back_inserter(next_submit_infos));
-    next_submit_infos[0].pCommandBufferInfos = std::data(next_command_buffers);
-    next_submit_infos[0].commandBufferInfoCount =
-        std::size(next_command_buffers);
-
-    const auto target_semaphore_sequence =
-        1 + queue_context->semaphore_sequence++;
-    const auto tail_ssi = VkSemaphoreSubmitInfo{
-        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
-        .semaphore = queue_context->semaphore,
-        .value = target_semaphore_sequence,
-        .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
-    };
-    const auto tail_cbsi = VkCommandBufferSubmitInfo{
-        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
-        .commandBuffer = tail_cb,
-    };
-    next_submit_infos.push_back(VkSubmitInfo2{
-        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
-        .commandBufferInfoCount = 1,
-        .pCommandBufferInfos = &tail_cbsi,
-        .signalSemaphoreInfoCount = 1,
-        .pSignalSemaphoreInfos = &tail_ssi,
-    });
-
-    if (const auto res =
-            vtable.QueueSubmit2(queue, std::size(next_submit_infos),
-                                std::data(next_submit_infos), fence);
-        res != VK_SUCCESS) {
-        return res;
-    }
+    for (const auto& submit_info : std::span{submit_infos, submit_count}) {
+        const auto head_handle = queue_context->timestamp_pool->acquire();
+        const auto tail_handle = queue_context->timestamp_pool->acquire();
+
+        next_cbs.emplace_back([&]() -> auto {
+            auto cbs = std::make_unique<cb_vect_t>();
+            head_handle->setup_command_buffers(*tail_handle, *queue_context);
+            cbs->push_back(VkCommandBufferSubmitInfo{
+                .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
+                .commandBuffer = head_handle->command_buffer,
+            });
+            std::ranges::copy_n(submit_info.pCommandBufferInfos,
+                                submit_info.commandBufferInfoCount,
+                                std::back_inserter(*cbs));
+            cbs->push_back(VkCommandBufferSubmitInfo{
+                .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
+                .commandBuffer = tail_handle->command_buffer,
+            });
+            return cbs;
+        }());
 
-    queue_context->notify_submit({submit_infos, submit_count},
-                                 target_semaphore_sequence,
-                                 std::move(timestamp_handle));
+        next_submits.push_back(submit_info);
+        next_submits.back().pCommandBufferInfos = std::data(*next_cbs.back());
+        next_submits.back().commandBufferInfoCount =
+            std::size(*next_cbs.back());
 
-    return VK_SUCCESS;
-    */
+        queue_context->notify_submit(submit_info, head_handle, tail_handle);
+
+        handles.push_back(head_handle);
+        handles.push_back(tail_handle);
+    }
+
+    return vtable.QueueSubmit2(queue, std::size(next_submits),
+                               std::data(next_submits), fence);
 }
 
 static VKAPI_ATTR VkResult VKAPI_CALL
@@ -621,10 +570,10 @@ vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* present_info) {
         queue_context->notify_present(*present_info);
     }
 
-    if (const auto sleep_time = queue_context->get_delay_time();
-        sleep_time.has_value()) {
+    if (const auto sleep_until = queue_context->get_sleep_until();
+        sleep_until.has_value()) {
 
-        std::this_thread::sleep_for(*sleep_time);
+        std::this_thread::sleep_until(*sleep_until);
     }
 
     return VK_SUCCESS;
diff --git a/src/queue_context.cc b/src/queue_context.cc
index 99cf51e..2b79b53 100644
--- a/src/queue_context.cc
+++ b/src/queue_context.cc
@@ -2,8 +2,10 @@
 #include "device_context.hh"
 #include "timestamp_pool.hh"
 
+#include <algorithm>
 #include <chrono>
 #include <iostream>
+#include <span>
 
 namespace low_latency {
 
@@ -67,7 +69,7 @@ QueueContext::~QueueContext() {
 }
 
 void QueueContext::notify_submit(
-    const VkSubmitInfo& info, const std::uint64_t& target_semaphore_sequence,
+    const VkSubmitInfo& info,
     const std::shared_ptr<TimestampPool::Handle> head_handle,
     const std::shared_ptr<TimestampPool::Handle> tail_handle) {
 
@@ -79,8 +81,7 @@ void QueueContext::notify_submit(
                         std::inserter(signals, std::end(signals)));
 
     this->submissions.emplace_back(std::make_unique<Submission>(
-        std::move(signals), std::move(waits), target_semaphore_sequence,
-        head_handle, tail_handle));
+        std::move(signals), std::move(waits), head_handle, tail_handle));
 
     // TODO HACK
     if (std::size(this->submissions) > 100) {
@@ -88,40 +89,42 @@ void QueueContext::notify_submit(
     }
 }
 
-/*
 void QueueContext::notify_submit(
-    std::span<const VkSubmitInfo2> infos,
-    const std::uint64_t target_semaphore_sequence,
-    std::shared_ptr<TimestampPool::Handle>&& handle) {
+    const VkSubmitInfo2& info,
+    const std::shared_ptr<TimestampPool::Handle> head_handle,
+    const std::shared_ptr<TimestampPool::Handle> tail_handle) {
 
     auto signals = std::unordered_set<VkSemaphore>{};
     auto waits = std::unordered_set<VkSemaphore>{};
 
-    for (const auto& info : infos) {
-        constexpr auto get_semaphore = [](const auto& semaphore_info) {
-            return semaphore_info.semaphore;
-        };
-        std::ranges::transform(info.pSignalSemaphoreInfos,
-                               std::next(info.pSignalSemaphoreInfos,
-                                         info.signalSemaphoreInfoCount),
-                               std::inserter(signals, std::end(signals)),
-                               get_semaphore);
-        std::ranges::transform(
-            info.pWaitSemaphoreInfos,
-            std::next(info.pWaitSemaphoreInfos, info.waitSemaphoreInfoCount),
-            std::inserter(waits, std::end(waits)), get_semaphore);
+    std::ranges::transform(
+        std::span{info.pWaitSemaphoreInfos, info.waitSemaphoreInfoCount},
+        std::inserter(waits, std::end(waits)),
+        [](const auto& info) -> auto { return info.semaphore; });
+
+    std::ranges::transform(
+        std::span{info.pSignalSemaphoreInfos, info.signalSemaphoreInfoCount},
+        std::inserter(signals, std::end(signals)),
+        [](const auto& info) -> auto { return info.semaphore; });
+
+    std::cerr << "submit2 notif for queue " << this->queue << '\n';
+    std::cerr << "    signals: \n";
+    for (const auto& signal : signals) {
+        std::cerr << "      " << signal << '\n';
+    }
+    std::cerr << "    waits: \n";
+    for (const auto& wait : waits) {
+        std::cerr << "      " << wait << '\n';
     }
 
     this->submissions.emplace_back(std::make_unique<Submission>(
-        std::move(signals), std::move(waits), target_semaphore_sequence,
-        std::move(handle)));
+        std::move(signals), std::move(waits), head_handle, tail_handle));
 
     // TODO HACK
     if (std::size(this->submissions) > 100) {
         this->submissions.pop_front();
     }
 }
-*/
 
 void QueueContext::notify_present(const VkPresentInfoKHR& info) {
 
@@ -153,7 +156,7 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) {
         return collected_semaphores;
     }();
 
-    const auto start_iter = std::ranges::find_if(
+    const auto acquire_iter = std::ranges::find_if(
         std::rbegin(this->submissions), std::rend(this->submissions),
         [&](const auto& submission) {
             return std::ranges::any_of(
@@ -162,13 +165,13 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) {
                 });
         });
 
-    if (start_iter == std::rend(this->submissions)) {
-        std::cout << "couldn't find starting submission!\n";
+    if (acquire_iter == std::rend(this->submissions)) {
+        std::cerr << "couldn't find starting submission!\n";
         return;
     }
-    const auto& start = *start_iter;
+    const auto& acquire = *acquire_iter;
 
-    const auto end_iter = std::ranges::find_if(
+    const auto present_iter = std::ranges::find_if(
         std::rbegin(this->submissions), std::rend(this->submissions),
         [&](const auto& submission) {
             return std::ranges::any_of(
@@ -176,43 +179,61 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) {
                 [&](const auto& signal) { return waits.contains(signal); });
         });
 
-    if (end_iter == std::rend(this->submissions)) {
-        std::cout << "couldn't find ending submission!\n";
+    if (present_iter == std::rend(this->submissions)) {
+        std::cerr << "couldn't find ending submission!\n";
         return;
     }
-    const auto& end = *end_iter;
+    const auto& end = *present_iter;
+
+    std::cerr << "present for queue: " << queue << ", our waits:\n";
+    for (const auto& wait : waits) {
+        std::cerr << "      " << wait << '\n';
+    }
+
+    // The work including and between acquire -> present is effectively
+    // guaranteed to contribute to our frame. We are going to mark this point
+    // for future queues to read the 'start of frame' from.
+    (*present_iter)->end_of_frame_marker = true;
+
+    // Now we read backwards to try to find our true start, starting at our
+    // acquire.
+    const auto start_iter = std::prev(std::ranges::find_if(
+        std::next(acquire_iter), std::rend(this->submissions),
+        [](const auto& submission) {
+            return submission->end_of_frame_marker;
+        }));
+    const auto& start = *start_iter;
+
+    // start_iter is always dereferenceable: std::prev moved it off rend().
 
     auto frame = Frame{.start =
                            Frame::Timepoint{
                                .context = *this,
                                .handle = start->start_handle,
-                               .sequence = start->sequence,
                            },
                        .end = Frame::Timepoint{
                            .context = *this,
                            .handle = end->end_handle,
-                           .sequence = end->sequence,
                        }};
     this->in_flight_frames.emplace_back(
         std::make_unique<Frame>(std::move(frame)));
-
-    // hack
-    if (this->in_flight_frames.size() > 5) {
-        this->in_flight_frames.pop_front();
-    }
 }
 
-std::optional<QueueContext::duration_t> QueueContext::get_delay_time() {
+const auto debug_log_time = [](const auto& diff) {
+    using namespace std::chrono;
+    const auto ms = duration_cast<milliseconds>(diff);
+    const auto us = duration_cast<microseconds>(diff - ms);
+    const auto ns = duration_cast<nanoseconds>(diff - ms - us);
+    std::cerr << ms << " " << us << " " << ns << "\n";
+};
+
+void QueueContext::process_frames() {
     if (!std::size(this->in_flight_frames)) {
-        return std::nullopt;
+        return;
     }
 
-    // We are about to query the wait semaphores of all of our current
-    // frames in flight. They may come from the same device, so we're going
-    // to build a mapping here to reduce vulkan calls. Not only that,
-    // we have to do this or else our timing information becomes broken
-    // as this loop iterates.
-    const auto target_devices = [this]() -> auto {
+    // Collect all devices and call calibrate.
+    [this]() -> auto {
         using context_ref_t = std::reference_wrapper<DeviceContext>;
         auto target_devices = std::unordered_map<VkDevice, context_ref_t>{};
         for (const auto& frame : this->in_flight_frames) {
@@ -222,237 +243,139 @@ std::optional<QueueContext::duration_t> QueueContext::get_delay_time() {
             target_devices.try_emplace(start.device, std::ref(start));
             target_devices.try_emplace(end.device, std::ref(end));
         }
-        return target_devices;
-    }();
-
-    // Calibrate timestamps before we acquire semaphores.
-    for (const auto& pair : target_devices) {
-        auto& device = pair.second;
-        device_context.clock.calibrate();
-    }
-
-    // Now we have all owned devices and their clocks are in a good state.
-    // We need to build another mapping of semaphores to their queries now.
-    const auto queue_sequences = [this]() -> auto {
-        auto queue_sequences = std::unordered_map<VkQueue, std::uint64_t>{};
-        for (const auto& frame : this->in_flight_frames) {
-            auto& start = frame->start.context;
-            auto& end = frame->end.context;
-
-            for (const auto& queue_ptr : {&start, &end}) {
-                if (queue_sequences.contains(queue_ptr->queue)) {
-                    continue;
-                }
-
-                const auto& vtable = queue_ptr->device_context.vtable;
-                auto seq = std::uint64_t{};
-                vtable.GetSemaphoreCounterValueKHR(this->device_context.device,
-                                                   this->semaphore, &seq);
-                queue_sequences.emplace(queue_ptr->queue, seq);
-            }
+        for (const auto& pair : target_devices) {
+            auto& device = pair.second.get();
+            device.clock.calibrate();
         }
-        return queue_sequences;
     }();
 
-    // Now all devices we are about to query are primed to query.
-    // We have all sequence numbers from all queus we could possibly query.
-    const auto S = std::size(this->in_flight_frames);
-    for (auto i = std::size_t{0}; i < S; ++i) {
-        assert(this->in_flight_frames[i]);
-        const auto& frame = *this->in_flight_frames[i];
-        const auto& start = frame.start;
-        const auto& end = frame.end;
-
-        std::cout << "    Evaluating the frame that's " << S - i - 1
-                  << " behind\n";
-
-        std::cout << "    target start seq: " << start.sequence << '\n';
-        std::cout << "    target end seq: " << end.sequence << '\n';
-
-        const auto start_seq_it = queue_sequences.find(start.context.queue);
-        assert(start_seq_it != std::end(queue_sequences));
-        const auto& start_seq = start_seq_it->second;
-        if (start_seq < start.sequence) {
-            std::cout << "        frame hasn't started yet !\n ";
-            continue;
-        }
+    const auto get_tick_time = [](const auto& timepoint)
+        -> std::optional<DeviceContext::Clock::time_point_t> {
+        const auto& handle = timepoint.handle;
+        const auto& context = timepoint.context;
 
-        /*
-        const auto start_ticks_opt =
-            start.handle->get_ticks(*start.context.timestamp_pool);
-        if (!start_ticks_opt.has_value()) {
-            std::cout << "        frame hasn't started yet !\n ";
+        const auto ticks = handle->get_ticks(*context.timestamp_pool);
+        if (!ticks.has_value()) {
+            return std::nullopt;
         }
+        const auto& clock = context.device_context.clock;
+        return clock.ticks_to_time(*ticks);
+    };
 
-        std::cout << "        START TICKS: " << start_ticks << '\n';
-        const auto start_time =
-            start.context.device_context.clock.ticks_to_time(start_ticks);
-
-        {
-            using namespace std::chrono;
-            const auto diff = now - a;
-            const auto ms = duration_cast<milliseconds>(diff);
-            const auto us = duration_cast<microseconds>(diff - ms);
-            const auto ns = duration_cast<nanoseconds>(diff - ms - us);
-            std::cout << "        frame started: " << ms << " ms " << us
-                      << " us " << ns << " ns ago\n";
-        }
+    std::cerr << "starting frame readout\n";
+    while (std::size(this->in_flight_frames)) {
+        const auto& frame = this->in_flight_frames.front();
+        assert(frame);
 
-        const auto end_seq_it = queue_sequences.find(end.context.queue);
-        assert(end_seq_it != std::end(queue_sequences));
-        const auto& end_seq = end_seq_it->second;
-        if (start_seq < end.sequence) {
-            std::cout << "        frame hasn't started yet !\n ";
-            continue;
+        const auto a = get_tick_time(frame->start);
+        if (!a.has_value()) {
+            break;
         }
-        */
-    }
-
-    return std::nullopt;
-    //
-}
 
-// now it's all coming together
-// std::optional<QueueContext::duration_t> QueueContext::get_delay_time() {
-/*
-if (!std::size(this->in_flight_frames)) {
-    return std::nullopt;
-}
-
-auto seq = std::uint64_t{};
-this->device_context.vtable.GetSemaphoreCounterValueKHR(
-    this->device_context.device, this->semaphore, &seq);
-
-// Get semaphore first, then poll!
-this->timestamp_pool->poll();
-
-// idk how frequently we should call this.
-this->device_context.calibrate_timestamps();
+        const auto b = get_tick_time(frame->end);
+        if (!b.has_value()) {
+            break;
+        }
 
-static auto gpu_frametimes = std::deque<uint64_t>{};
-static auto cpu_frametimes = std::deque<uint64_t>{};
+        // assert(a <= b);
 
-const auto S = std::size(this->in_flight_frames);
+        //
+        const auto last_b =
+            this->timings.empty() ? *a : this->timings.back()->gpu_end;
 
-std::cout << "\nSTART FRAME READOUT\n";
-std::cout << "error bound: " << this->device_context.clock.error_bound
-          << '\n';
-std::cout << "num frames in flight: " << S << '\n';
-std::cout << "from oldest -> newest\n";
+        // assert(last_b <= a);
 
-// const auto b_seq = semaphore_from_context(*this);
-const auto now = std::chrono::steady_clock::now();
+        const auto frametime = *b - last_b;
 
-auto i = std::size_t{0};
-for (; i < std::size(this->in_flight_frames); ++i) {
-    const auto& frame = this->in_flight_frames[i];
-    std::cout << "    Evaluating the frame that's " << S - i - 1
-              << " behind\n";
-    if (!frame) {
-        std::cout << "        nullptr!\n";
-        continue;
-    }
+        std::cerr
+            << "        calculated total time from last frame (frametime): ";
+        debug_log_time(*b - last_b);
 
-    std::cout << "    target start: " << frame->target_start_sequence <<
-'\n'; std::cout << "    target end: " << frame->target_end_sequence << '\n'; if
-(seq < frame->target_start_sequence) { std::cout << "        frame hasn't
-started yet!\n"; continue;
-    }
+        this->timings.emplace_back(std::make_unique<Timing>(Timing{
+            .gpu_start = *a,
+            .gpu_end = *b,
+            .frametime = frametime,
+        }));
 
-    const auto start_ticks =
-        frame->start_context.timestamp_pool->get_polled(*frame->start);
-    std::cout << "        START TICKS: " << start_ticks << '\n';
-    const auto& a_clock = frame->start_context.device_context.clock;
-    const auto a = a_clock.ticks_to_time(start_ticks);
-
-    {
-        using namespace std::chrono;
-        const auto diff = now - a;
-        const auto ms = duration_cast<milliseconds>(diff);
-        const auto us = duration_cast<microseconds>(diff - ms);
-        const auto ns = duration_cast<nanoseconds>(diff - ms - us);
-        std::cout << "        frame started: " << ms << " ms " << us
-                  << " us " << ns << " ns ago\n";
-    }
-
-    if (seq < frame->target_end_sequence) {
-        std::cout << "        frame hasn't ended yet!\n";
-        continue;
-    }
-
-
-    const auto end_ticks =
-        frame->end_context.timestamp_pool->get_polled(*frame->end, true);
-    const auto& b_clock = frame->end_context.device_context.clock;
-    std::cout << "        END_TICKS: " << end_ticks << '\n';
-    const auto b = b_clock.ticks_to_time(end_ticks);
-    {
-        using namespace std::chrono;
-        if (now <= b) {
-            std::cout << "b happened before now?\n";
-        }
-        const auto diff = now - b;
-        const auto ms = duration_cast<milliseconds>(diff);
-        const auto us = duration_cast<microseconds>(diff - ms);
-        const auto ns = duration_cast<nanoseconds>(diff - ms - us);
-        std::cout << "        frame ended: " << ms << " ms " << us
-                  << " us " << ns << " ns ago\n";
+        this->in_flight_frames.pop_front();
     }
 
-    const auto gpu_time = b - a;
-    {
-        using namespace std::chrono;
-        const auto diff = gpu_time;
-        const auto ms = duration_cast<milliseconds>(diff);
-        const auto us = duration_cast<microseconds>(diff - ms);
-        const auto ns = duration_cast<nanoseconds>(diff - ms - us);
-        std::cout << "        gpu_time: " << ms << " ms " << us
-                  << " us " << ns << " ns ago\n";
+    const auto MAX_TRACKED = 50;
+    if (std::size(this->timings) < MAX_TRACKED) {
+        return;
     }
-
-    /*
-    cpu_frametimes.emplace_back(cpu_time);
-    gpu_frametimes.emplace_back(gpu_time);
-}
-
-/*
-if (remove_index.has_value()) {
-    this->in_flight_frames.erase(std::begin(this->in_flight_frames),
-                                 std::begin(this->in_flight_frames) +
-                                     *remove_index);
+    this->timings.erase(std::begin(this->timings),
+                        std::next(std::begin(this->timings),
+                                  std::size(this->timings) - MAX_TRACKED));
 }
-*/
-
-/*
-auto g_copy = gpu_frametimes;
-auto c_copy = cpu_frametimes;
-std::ranges::sort(g_copy);
-std::ranges::sort(c_copy);
-
-constexpr auto N = 49;
-if (std::size(cpu_frametimes) < N) {
-    return std::nullopt;
-}
-
-const auto F = std::size(g_copy);
-// close enough to median lol
-const auto g = g_copy[F / 2];
-const auto c = c_copy[F / 2];
 
-std::cout << g << '\n';
+using opt_time_point_t = std::optional<DeviceContext::Clock::time_point_t>;
+opt_time_point_t QueueContext::get_sleep_until() {
 
-std::cout << "    median gpu: " << (g / 1'000'000) << " ms " << g / 1'000
-          << " us " << g << " ns\n";
-std::cout << "    median cpu: " << c / 1'000'000 << " ms " << c / 1'000
-          << " us " << c << " ns\n";
+    // Call this to push all in flight frames into our timings structure,
+    // but only if they're completed. So now they are truly *in flight frames*.
+    this->process_frames();
+    // Nothing left in flight means nothing to wait for, and with no timing
+    // history yet there is no median to extrapolate from. DO NOT WAIT!
+    if (this->in_flight_frames.empty() || this->timings.empty()) {
+        return std::nullopt;
+    }
 
-if (F > N) {
-    gpu_frametimes.pop_front();
-    cpu_frametimes.pop_front();
-}
+    const auto median_frametime = [&, this]() {
+        auto vect = std::vector<Timing*>{};
+        std::ranges::transform(this->timings, std::back_inserter(vect),
+                               [](const auto& timing) { return timing.get(); });
+        std::ranges::sort(vect, [](const auto& a, const auto& b) {
+            return a->frametime < b->frametime;
+        });
+        return vect[std::size(vect) / 2]->frametime;
+    }();
 
-return std::nullopt;
+    //                                    PRESENT CALL
+    // | -------x----- | -------x--------------|
+    // ^ last_b        ^ a                     ^ b
+    //
+    // Us, the CPU on the host, is approximately at 'b'.
+    // We have a good guess for the distance between
+    // last_b and b (median_frametime).
+    // The GPU is at any point on this line (marked as x).
+    // Don't use A. It's less robust than just using last_b.
+    // It *might* be more accurate because it's closer,
+    // but there's an issue where there can sometimes be a very
+    // small distance between a and b because it is just the
+    // point in time when the vkAcquireSwapchainKHR signals
+    // the wait on the gpu queue, which can sometimes be tiny.
+
+    std::cerr << "    median frametime: ";
+    debug_log_time(median_frametime);
+
+    // 2% of the median frametime as slack for dealing with variance.
+    // This could be calculated more precisely with the
+    // numbers we have (like we could construct a high% confidence
+    // interval? not big on maths).
+    const auto slack = median_frametime / 50;
+
+    // If we're more than 1 frame queued, then we should wait for
+    // that to complete before returning. It's likely way better to
+    // sleep twice here and recompute between sleeps because we're
+    // extrapolating really far into the future here! TODO
+    const auto extra_delay =
+        median_frametime * (std::size(this->in_flight_frames) - 1);
+
+    const auto& last_b = this->timings.back()->gpu_end;
+
+    // All educated guesses:
+    //  dist_to_b = frametime - dist_to_last_b;
+    //  dist_to_last_b = now - last_b
+    //  sleep_until = now + extra_delay + slack + dist_to_b
+    //              = now + extra_delay + slack + (frametime - dist_to_last_b)
+    //              = now + extra_delay + slack + frametime - (now - last_b)
+
+    const auto now = std::chrono::steady_clock::now();
+    assert(last_b <= now);
+    const auto dist = now - last_b;
+    // Even if this is negative, it's a no-op to sleep backwards.
+    return now + extra_delay + slack + median_frametime - dist;
 }
-*/
 
 } // namespace low_latency
 \ No newline at end of file
diff --git a/src/queue_context.hh b/src/queue_context.hh
index 3df6af4..6a71754 100644
--- a/src/queue_context.hh
+++ b/src/queue_context.hh
@@ -2,6 +2,7 @@
 #define QUEUE_STATE_HH_
 
 #include "context.hh"
+#include "device_context.hh"
 #include "timestamp_pool.hh"
 
 #include <vulkan/utility/vk_dispatch_table.h>
@@ -14,8 +15,6 @@
 
 namespace low_latency {
 
-class DeviceContext;
-
 class QueueContext final : public Context {
   public:
     DeviceContext& device_context;
@@ -23,6 +22,8 @@ class QueueContext final : public Context {
     const VkQueue queue;
     const std::uint32_t queue_family_index;
 
+    // I used to use these to signal when we could read timestamps until
+    // I realised you could use hostQueryReset.
     std::uint64_t semaphore_sequence = 0;
     VkSemaphore semaphore;
 
@@ -30,14 +31,17 @@ class QueueContext final : public Context {
 
     std::unique_ptr<TimestampPool> timestamp_pool;
 
+  private:
+    static constexpr auto MAX_TRACKED_TIMINGS = 50;
     // Potentially in flight queue submissions
     struct Submission {
         const std::unordered_set<VkSemaphore> signals;
         const std::unordered_set<VkSemaphore> waits;
-        const std::uint64_t sequence;
 
         const std::shared_ptr<TimestampPool::Handle> start_handle;
         const std::shared_ptr<TimestampPool::Handle> end_handle;
+
+        bool end_of_frame_marker = false;
     };
     std::deque<std::shared_ptr<Submission>> submissions;
 
@@ -48,7 +52,6 @@ class QueueContext final : public Context {
         struct Timepoint {
             const QueueContext& context;
             const std::shared_ptr<TimestampPool::Handle> handle;
-            const std::uint64_t sequence;
         };
 
         const Timepoint start;
@@ -56,6 +59,20 @@ class QueueContext final : public Context {
     };
     std::deque<std::unique_ptr<Frame>> in_flight_frames;
 
+    struct Timing {
+
+        DeviceContext::Clock::time_point_t gpu_start;
+        DeviceContext::Clock::time_point_t gpu_end;
+
+        // Distance between the last gpu_end and this one.
+        // So one entire go around, including all cpu and gpu.
+        DeviceContext::Clock::time_point_t::duration frametime;
+    };
+    std::deque<std::unique_ptr<Timing>> timings;
+
+  private:
+    void process_frames();
+
   public:
     QueueContext(DeviceContext& device_context, const VkQueue& queue,
                  const std::uint32_t& queue_family_index);
@@ -63,18 +80,19 @@ class QueueContext final : public Context {
 
   public:
     void
-    notify_submit(const VkSubmitInfo& info, const std::uint64_t& sequence,
+    notify_submit(const VkSubmitInfo& info,
                   const std::shared_ptr<TimestampPool::Handle> head_handle,
                   const std::shared_ptr<TimestampPool::Handle> tail_handle);
 
-    // TODO submit2
+    void
+    notify_submit(const VkSubmitInfo2& info,
+                  const std::shared_ptr<TimestampPool::Handle> head_handle,
+                  const std::shared_ptr<TimestampPool::Handle> tail_handle);
 
     void notify_present(const VkPresentInfoKHR& info);
 
   public:
-    // Computes the amount we should delay...
-    using duration_t = std::chrono::steady_clock::duration;
-    std::optional<duration_t> get_delay_time();
+    std::optional<DeviceContext::Clock::time_point_t> get_sleep_until();
 };
 
 }; // namespace low_latency