aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/device_context.cc46
-rw-r--r--src/device_context.hh11
-rw-r--r--src/layer.cc116
-rw-r--r--src/queue_context.cc429
-rw-r--r--src/queue_context.hh35
-rw-r--r--src/timestamp_pool.cc208
-rw-r--r--src/timestamp_pool.hh63
7 files changed, 527 insertions, 381 deletions
diff --git a/src/device_context.cc b/src/device_context.cc
index 4b39210..f849df1 100644
--- a/src/device_context.cc
+++ b/src/device_context.cc
@@ -31,50 +31,58 @@ void DeviceContext::notify_acquire(const VkSwapchainKHR& swapchain,
it->second.insert_or_assign(image_index, signal_semaphore);
}
-DeviceContext::Clock::Clock(const DeviceContext& context) {
+DeviceContext::Clock::Clock(const DeviceContext& context) : device(context) {
+ this->calibrate();
+}
+
+DeviceContext::Clock::~Clock() {}
+void DeviceContext::Clock::calibrate() {
const auto infos = std::vector<VkCalibratedTimestampInfoKHR>{
{VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr,
VK_TIME_DOMAIN_DEVICE_EXT},
{VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr,
VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT}};
- auto device_host = std::array<std::uint64_t, 2>{};
+ struct CalibratedResult {
+ std::uint64_t device;
+ std::uint64_t host;
+ };
+ auto calibrated_result = CalibratedResult{};
+ // we probably want to use this instead bc clock_gettime isn't guaranteed
+ // by steady clock afaik
+ /*
+ struct timespec tv;
+ clock_gettime(CLOCK_MONOTONIC, &tv);
+ return tv.tv_nsec + tv.tv_sec*1000000000ull;
+ */
const auto steady_before = std::chrono::steady_clock::now();
- context.vtable.GetCalibratedTimestampsKHR(
- context.device, 2, std::data(infos), std::data(device_host),
- &this->error_bound);
+ device.vtable.GetCalibratedTimestampsKHR(device.device, 2, std::data(infos),
+ &calibrated_result.device,
+ &this->error_bound);
const auto steady_after = std::chrono::steady_clock::now();
this->cpu_time = steady_before + (steady_after - steady_before) / 2;
- this->device_ticks = device_host[0];
- this->host_ns = device_host[1];
+ this->device_ticks = calibrated_result.device;
+ this->host_ns = calibrated_result.host;
- // Might need to get physical limits again?
- this->ticks_per_ns =
- context.physical_device.properties->limits.timestampPeriod;
+ // Might need to get physical limits every now and then?
+ const auto& pd = device.physical_device.properties;
+ this->ticks_per_ns = pd->limits.timestampPeriod;
}
DeviceContext::Clock::time_point_t
DeviceContext::Clock::ticks_to_time(const std::uint64_t& ticks) const {
- /*
- struct timespec tv;
- clock_gettime(CLOCK_MONOTONIC, &tv);
- return tv.tv_nsec + tv.tv_sec*1000000000ull;
- */
-
auto a = this->device_ticks;
auto b = ticks;
-
const auto was_before = a > b;
if (was_before) { // it's happened before
std::swap(a, b);
}
+
const auto nsec = std::chrono::nanoseconds((b - a) * this->ticks_per_ns);
return this->cpu_time + (was_before ? -nsec : nsec);
}
-void DeviceContext::calibrate_timestamps() { this->clock = Clock{*this}; }
-
} // namespace low_latency \ No newline at end of file
diff --git a/src/device_context.hh b/src/device_context.hh
index b55b70c..c08cec2 100644
--- a/src/device_context.hh
+++ b/src/device_context.hh
@@ -35,8 +35,11 @@ struct DeviceContext final : public Context {
std::unordered_map<VkSwapchainKHR, index_semaphores_t> swapchain_signals;
struct Clock {
+ public:
using time_point_t = std::chrono::steady_clock::time_point;
+ const DeviceContext& device;
+ public:
time_point_t cpu_time;
std::uint64_t error_bound;
std::uint64_t device_ticks;
@@ -45,7 +48,10 @@ struct DeviceContext final : public Context {
public:
Clock(const DeviceContext& device);
-
+ ~Clock();
+
+ public:
+ void calibrate();
time_point_t ticks_to_time(const std::uint64_t& ticks) const;
};
Clock clock;
@@ -61,9 +67,6 @@ struct DeviceContext final : public Context {
void notify_acquire(const VkSwapchainKHR& swapchain,
const std::uint32_t& image_index,
const VkSemaphore& signal_semaphore);
-
- public:
- void calibrate_timestamps();
};
}; // namespace low_latency
diff --git a/src/layer.cc b/src/layer.cc
index c521bb9..1b1d9e7 100644
--- a/src/layer.cc
+++ b/src/layer.cc
@@ -1,9 +1,12 @@
#include "layer.hh"
+#include <memory>
+#include <span>
#include <string_view>
#include <thread>
#include <unordered_map>
#include <utility>
+#include <vector>
#include <vulkan/utility/vk_dispatch_table.h>
#include <vulkan/vk_layer.h>
@@ -224,7 +227,8 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
const auto wanted_extensions = {
VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME,
VK_KHR_CALIBRATED_TIMESTAMPS_EXTENSION_NAME,
- VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME};
+ VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME,
+ VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME};
for (const auto& wanted : wanted_extensions) {
@@ -274,7 +278,7 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
return result;
}
-
+
#define DEVICE_VTABLE_LOAD(name) \
.name = reinterpret_cast<PFN_vk##name>(gdpa(*pDevice, "vk" #name))
auto vtable = VkuDeviceDispatchTable{
@@ -294,9 +298,9 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
DEVICE_VTABLE_LOAD(BeginCommandBuffer),
DEVICE_VTABLE_LOAD(EndCommandBuffer),
DEVICE_VTABLE_LOAD(ResetCommandBuffer),
- DEVICE_VTABLE_LOAD(CmdResetQueryPool),
DEVICE_VTABLE_LOAD(CmdDraw),
DEVICE_VTABLE_LOAD(CmdDrawIndexed),
+ DEVICE_VTABLE_LOAD(CmdResetQueryPool),
DEVICE_VTABLE_LOAD(GetDeviceQueue2),
DEVICE_VTABLE_LOAD(QueueSubmit2),
DEVICE_VTABLE_LOAD(AcquireNextImageKHR),
@@ -306,6 +310,7 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
DEVICE_VTABLE_LOAD(CmdWriteTimestamp2KHR),
DEVICE_VTABLE_LOAD(QueueSubmit2KHR),
DEVICE_VTABLE_LOAD(GetCalibratedTimestampsKHR),
+ DEVICE_VTABLE_LOAD(ResetQueryPoolEXT),
};
#undef DEVICE_VTABLE_LOAD
@@ -442,61 +447,81 @@ static VKAPI_ATTR VkResult VKAPI_CALL vkAcquireNextImage2KHR(
static VKAPI_ATTR VkResult VKAPI_CALL
vkQueueSubmit(VkQueue queue, std::uint32_t submit_count,
- const VkSubmitInfo* submit_info, VkFence fence) {
+ const VkSubmitInfo* submit_infos, VkFence fence) {
const auto& queue_context = layer_context.get_context(queue);
const auto& vtable = queue_context->device_context.vtable;
if (!submit_count) { // no-op submit we shouldn't worry about
- return vtable.QueueSubmit(queue, submit_count, submit_info, fence);
+ return vtable.QueueSubmit(queue, submit_count, submit_infos, fence);
}
- // Create a new vector of submit infos.
- auto next_submit_infos = std::vector<VkSubmitInfo>{};
-
- auto timestamp_handle = queue_context->timestamp_pool->acquire();
- timestamp_handle->setup_command_buffers(vtable);
-
- const auto& [head_cb, tail_cb] = timestamp_handle->command_buffers;
+ // We have to avoid casting away the const* of the passed VkSubmitInfos.
+ // We wrap every single submission with *two* extra VkSubmitInfos to
+ // accomplish this. The first executes a command buffer that
+
+ using cb_vect = std::vector<VkCommandBuffer>;
+ using tssi_ptr_t = std::unique_ptr<VkTimelineSemaphoreSubmitInfo>;
+ auto next_submits = std::vector<VkSubmitInfo>{};
+ auto next_cbs = std::vector<std::unique_ptr<cb_vect>>{};
+ auto next_signals = std::vector<std::unique_ptr<std::uint64_t>>{};
+ auto next_tssis = std::vector<tssi_ptr_t>{};
+ auto handles = std::vector<std::shared_ptr<TimestampPool::Handle>>{};
+
+ for (const auto& submit_info : std::span{submit_infos, submit_count}) {
+ const auto head_handle = queue_context->timestamp_pool->acquire();
+ const auto tail_handle = queue_context->timestamp_pool->acquire();
+
+ // Head is special as we need to inject a CB into a copy of
+ // their command buffers that records the time the waits completed.
+ next_cbs.emplace_back([&]() -> auto {
+ auto cbs = std::make_unique<std::vector<VkCommandBuffer>>();
+ head_handle->setup_command_buffers(*tail_handle, *queue_context);
+ cbs->push_back(head_handle->command_buffer);
+ std::ranges::copy_n(submit_info.pCommandBuffers,
+ submit_info.commandBufferCount,
+ std::back_inserter(*cbs));
+ cbs->push_back(tail_handle->command_buffer);
+ return cbs;
+ }());
+ next_submits.push_back(submit_info);
+ next_submits.back().pCommandBuffers = std::data(*next_cbs.back());
+ next_submits.back().commandBufferCount = std::size(*next_cbs.back());
+
+ const auto next_signal = 1 + queue_context->semaphore_sequence++;
+
+ next_signals.push_back(std::make_unique<std::uint64_t>(next_signal));
+
+ next_tssis.push_back(std::make_unique<VkTimelineSemaphoreSubmitInfo>(
+ VkTimelineSemaphoreSubmitInfo{
+ .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR,
+ .signalSemaphoreValueCount = 1,
+ .pSignalSemaphoreValues = next_signals.back().get(),
+ }));
+ next_submits.push_back(VkSubmitInfo{
+ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+ .pNext = next_tssis.back().get(),
+ .commandBufferCount = 1,
+ .pCommandBuffers = &tail_handle->command_buffer,
+ .signalSemaphoreCount = 1,
+ .pSignalSemaphores = &queue_context->semaphore,
+ });
- const auto next_command_buffers = [&]() -> auto {
- auto next_command_buffers = std::vector<VkCommandBuffer>{head_cb};
- std::ranges::copy_n(submit_info[0].pCommandBuffers,
- submit_info[0].commandBufferCount,
- std::back_inserter(next_command_buffers));
- return next_command_buffers;
- }();
+ queue_context->notify_submit(submit_info, next_signal, head_handle,
+ tail_handle);
- std::ranges::copy_n(submit_info, submit_count,
- std::back_inserter(next_submit_infos));
- next_submit_infos[0].pCommandBuffers = std::data(next_command_buffers);
- next_submit_infos[0].commandBufferCount = std::size(next_command_buffers);
-
- const auto next_signal = 1 + queue_context->semaphore_sequence++;
- const auto tail_tssi = VkTimelineSemaphoreSubmitInfo{
- .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR,
- .signalSemaphoreValueCount = 1,
- .pSignalSemaphoreValues = &next_signal,
- };
- next_submit_infos.push_back(VkSubmitInfo{
- .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
- .pNext = &tail_tssi,
- .commandBufferCount = 1,
- .pCommandBuffers = &tail_cb,
- .signalSemaphoreCount = 1,
- .pSignalSemaphores = &queue_context->semaphore,
- });
+ handles.push_back(head_handle);
+ handles.push_back(tail_handle);
+ }
- if (const auto res =
- vtable.QueueSubmit(queue, std::size(next_submit_infos),
- std::data(next_submit_infos), fence);
+ if (const auto res = vtable.QueueSubmit(queue, std::size(next_submits),
+ std::data(next_submits), fence);
res != VK_SUCCESS) {
return res;
}
- queue_context->notify_submit(std::span{submit_info, submit_count},
- next_signal, std::move(timestamp_handle));
+ // ?!?
return VK_SUCCESS;
}
@@ -509,10 +534,12 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count,
const auto queue_context = layer_context.get_context(queue);
const auto& vtable = queue_context->device_context.vtable;
- if (!submit_count) {
+ // TODO
+ if (!submit_count || true) {
return vtable.QueueSubmit2(queue, submit_count, submit_infos, fence);
}
+ /*
auto timestamp_handle = queue_context->timestamp_pool->acquire();
timestamp_handle->setup_command_buffers(vtable);
const auto& [head_cb, tail_cb] = timestamp_handle->command_buffers;
@@ -568,6 +595,7 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count,
std::move(timestamp_handle));
return VK_SUCCESS;
+ */
}
static VKAPI_ATTR VkResult VKAPI_CALL
diff --git a/src/queue_context.cc b/src/queue_context.cc
index 9b46773..99cf51e 100644
--- a/src/queue_context.cc
+++ b/src/queue_context.cc
@@ -67,25 +67,20 @@ QueueContext::~QueueContext() {
}
void QueueContext::notify_submit(
- std::span<const VkSubmitInfo> infos,
- const std::uint64_t target_semaphore_sequence,
- std::shared_ptr<TimestampPool::Handle>&& handle) {
-
- // This has an issue where we're collecting all signals and waits and
- // treating a single submit call as finishing
+ const VkSubmitInfo& info, const std::uint64_t& target_semaphore_sequence,
+ const std::shared_ptr<TimestampPool::Handle> head_handle,
+ const std::shared_ptr<TimestampPool::Handle> tail_handle) {
auto signals = std::unordered_set<VkSemaphore>{};
auto waits = std::unordered_set<VkSemaphore>{};
- for (const auto& info : infos) {
- std::ranges::copy_n(info.pWaitSemaphores, info.waitSemaphoreCount,
- std::inserter(waits, std::end(waits)));
- std::ranges::copy_n(info.pSignalSemaphores, info.signalSemaphoreCount,
- std::inserter(signals, std::end(signals)));
- }
+ std::ranges::copy_n(info.pWaitSemaphores, info.waitSemaphoreCount,
+ std::inserter(waits, std::end(waits)));
+ std::ranges::copy_n(info.pSignalSemaphores, info.signalSemaphoreCount,
+ std::inserter(signals, std::end(signals)));
this->submissions.emplace_back(std::make_unique<Submission>(
std::move(signals), std::move(waits), target_semaphore_sequence,
- std::move(handle)));
+ head_handle, tail_handle));
// TODO HACK
if (std::size(this->submissions) > 100) {
@@ -93,6 +88,7 @@ void QueueContext::notify_submit(
}
}
+/*
void QueueContext::notify_submit(
std::span<const VkSubmitInfo2> infos,
const std::uint64_t target_semaphore_sequence,
@@ -100,6 +96,7 @@ void QueueContext::notify_submit(
auto signals = std::unordered_set<VkSemaphore>{};
auto waits = std::unordered_set<VkSemaphore>{};
+
for (const auto& info : infos) {
constexpr auto get_semaphore = [](const auto& semaphore_info) {
return semaphore_info.semaphore;
@@ -124,21 +121,18 @@ void QueueContext::notify_submit(
this->submissions.pop_front();
}
}
+*/
void QueueContext::notify_present(const VkPresentInfoKHR& info) {
- auto frame = [&]() -> std::unique_ptr<Frame> {
- const auto waits = [&]() {
- auto waits = std::unordered_set<VkSemaphore>{};
- std::ranges::copy_n(info.pWaitSemaphores, info.waitSemaphoreCount,
- std::inserter(waits, std::end(waits)));
- return waits;
- }();
-
- const auto wait_semaphores = std::unordered_set<VkSemaphore>{
- info.pWaitSemaphores,
- std::next(info.pWaitSemaphores, info.waitSemaphoreCount)};
+ const auto waits = [&]() {
+ auto waits = std::unordered_set<VkSemaphore>{};
+ std::ranges::copy_n(info.pWaitSemaphores, info.waitSemaphoreCount,
+ std::inserter(waits, std::end(waits)));
+ return waits;
+ }();
+ const auto collected_semaphores = [&info, this]() {
auto collected_semaphores = std::unordered_set<VkSemaphore>{};
for (auto i = std::uint32_t{0}; i < info.swapchainCount; ++i) {
const auto& swapchain = info.pSwapchains[i];
@@ -153,112 +147,147 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) {
const auto index_it = swapchain_it->second.find(index);
assert(index_it != std::end(swapchain_it->second));
- const auto semaphore = index_it->second;
+ const auto& semaphore = index_it->second;
collected_semaphores.emplace(index_it->second);
}
+ return collected_semaphores;
+ }();
- const auto start_submission_it = std::ranges::find_if(
- std::rbegin(this->submissions), std::rend(this->submissions),
- [&](const auto& submission) {
- return std::ranges::any_of(
- submission->waits, [&](const auto& wait) {
- return collected_semaphores.contains(wait);
- });
- });
-
- if (start_submission_it == std::rend(this->submissions)) {
- std::cout << "couldn't find starting submission!\n";
- return nullptr;
- }
- const auto& start_submission = *start_submission_it;
-
- const auto end_submission_it = std::ranges::find_if(
- std::rbegin(this->submissions), std::rend(this->submissions),
- [&](const auto& submission) {
- return std::ranges::any_of(
- submission->signals, [&](const auto& signal) {
- return wait_semaphores.contains(signal);
- });
- });
-
- if (end_submission_it == std::rend(this->submissions)) {
- std::cout << "couldn't find ending submission!\n";
- return nullptr;
- }
- const auto& end_submission = *end_submission_it;
-
- return std::make_unique<Frame>(Frame{
- .start_context = *this,
- .start = start_submission->timestamp_handle,
- .target_start_sequence =
- start_submission->target_semaphore_sequence,
- .end_context = *this,
- .end = start_submission->timestamp_handle,
- .target_end_sequence = start_submission->target_semaphore_sequence,
+ const auto start_iter = std::ranges::find_if(
+ std::rbegin(this->submissions), std::rend(this->submissions),
+ [&](const auto& submission) {
+ return std::ranges::any_of(
+ submission->waits, [&](const auto& wait) {
+ return collected_semaphores.contains(wait);
+ });
});
- }();
- this->in_flight_frames.emplace_back(std::move(frame));
-
+ if (start_iter == std::rend(this->submissions)) {
+ std::cout << "couldn't find starting submission!\n";
+ return;
+ }
+ const auto& start = *start_iter;
+
+ const auto end_iter = std::ranges::find_if(
+ std::rbegin(this->submissions), std::rend(this->submissions),
+ [&](const auto& submission) {
+ return std::ranges::any_of(
+ submission->signals,
+ [&](const auto& signal) { return waits.contains(signal); });
+ });
+
+ if (end_iter == std::rend(this->submissions)) {
+ std::cout << "couldn't find ending submission!\n";
+ return;
+ }
+ const auto& end = *end_iter;
+
+ auto frame = Frame{.start =
+ Frame::Timepoint{
+ .context = *this,
+ .handle = start->start_handle,
+ .sequence = start->sequence,
+ },
+ .end = Frame::Timepoint{
+ .context = *this,
+ .handle = end->end_handle,
+ .sequence = end->sequence,
+ }};
+ this->in_flight_frames.emplace_back(
+ std::make_unique<Frame>(std::move(frame)));
+
// hack
if (this->in_flight_frames.size() > 5) {
this->in_flight_frames.pop_front();
}
}
-// now it's all coming together
std::optional<QueueContext::duration_t> QueueContext::get_delay_time() {
if (!std::size(this->in_flight_frames)) {
return std::nullopt;
}
- auto seq = std::uint64_t{};
- this->device_context.vtable.GetSemaphoreCounterValueKHR(
- this->device_context.device, this->semaphore, &seq);
-
- // Get semaphore first, then poll!
- this->timestamp_pool->poll();
+ // We are about to query the wait semaphores of all of our current
+ // frames in flight. They may come from the same device, so we're going
+ // to build a mapping here to reduce vulkan calls. Not only that,
+ // we have to do this or else our timing information becomes broken
+ // as this loop iterates.
+ const auto target_devices = [this]() -> auto {
+ using context_ref_t = std::reference_wrapper<DeviceContext>;
+ auto target_devices = std::unordered_map<VkDevice, context_ref_t>{};
+ for (const auto& frame : this->in_flight_frames) {
+ auto& start = frame->start.context.device_context;
+ auto& end = frame->end.context.device_context;
+
+ target_devices.try_emplace(start.device, std::ref(start));
+ target_devices.try_emplace(end.device, std::ref(end));
+ }
+ return target_devices;
+ }();
- // idk how frequently we should call this.
- this->device_context.calibrate_timestamps();
+ // Calibrate timestamps before we acquire semaphores.
+ for (const auto& pair : target_devices) {
+ auto& device = pair.second;
+ device_context.clock.calibrate();
+ }
- static auto gpu_frametimes = std::deque<uint64_t>{};
- static auto cpu_frametimes = std::deque<uint64_t>{};
+ // Now we have all owned devices and their clocks are in a good state.
+ // We need to build another mapping of semaphores to their queries now.
+ const auto queue_sequences = [this]() -> auto {
+ auto queue_sequences = std::unordered_map<VkQueue, std::uint64_t>{};
+ for (const auto& frame : this->in_flight_frames) {
+ auto& start = frame->start.context;
+ auto& end = frame->end.context;
+
+ for (const auto& queue_ptr : {&start, &end}) {
+ if (queue_sequences.contains(queue_ptr->queue)) {
+ continue;
+ }
+
+ const auto& vtable = queue_ptr->device_context.vtable;
+ auto seq = std::uint64_t{};
+ vtable.GetSemaphoreCounterValueKHR(this->device_context.device,
+ this->semaphore, &seq);
+ queue_sequences.emplace(queue_ptr->queue, seq);
+ }
+ }
+ return queue_sequences;
+ }();
+ // Now all devices we are about to query are primed to query.
+ // We have all sequence numbers from all queus we could possibly query.
const auto S = std::size(this->in_flight_frames);
+ for (auto i = std::size_t{0}; i < S; ++i) {
+ assert(this->in_flight_frames[i]);
+ const auto& frame = *this->in_flight_frames[i];
+ const auto& start = frame.start;
+ const auto& end = frame.end;
- std::cout << "\nSTART FRAME READOUT\n";
- std::cout << "error bound: " << this->device_context.clock.error_bound
- << '\n';
- std::cout << "num frames in flight: " << S << '\n';
- std::cout << "from oldest -> newest\n";
-
- // const auto b_seq = semaphore_from_context(*this);
- const auto now = std::chrono::steady_clock::now();
-
- auto i = std::size_t{0};
- for (; i < std::size(this->in_flight_frames); ++i) {
- const auto& frame = this->in_flight_frames[i];
std::cout << " Evaluating the frame that's " << S - i - 1
<< " behind\n";
- if (!frame) {
- std::cout << " nullptr!\n";
+
+ std::cout << " target start seq: " << start.sequence << '\n';
+ std::cout << " target end seq: " << end.sequence << '\n';
+
+ const auto start_seq_it = queue_sequences.find(start.context.queue);
+ assert(start_seq_it != std::end(queue_sequences));
+ const auto& start_seq = start_seq_it->second;
+ if (start_seq < start.sequence) {
+ std::cout << " frame hasn't started yet !\n ";
continue;
}
- std::cout << " target start: " << frame->target_start_sequence << '\n';
- std::cout << " target end: " << frame->target_end_sequence << '\n';
- if (seq < frame->target_start_sequence) {
- std::cout << " frame hasn't started yet!\n";
- continue;
+ /*
+ const auto start_ticks_opt =
+ start.handle->get_ticks(*start.context.timestamp_pool);
+ if (!start_ticks_opt.has_value()) {
+ std::cout << " frame hasn't started yet !\n ";
}
- const auto start_ticks =
- frame->start_context.timestamp_pool->get_polled(*frame->start);
std::cout << " START TICKS: " << start_ticks << '\n';
- const auto& a_clock = frame->start_context.device_context.clock;
- const auto a = a_clock.ticks_to_time(start_ticks);
-
+ const auto start_time =
+ start.context.device_context.clock.ticks_to_time(start_ticks);
+
{
using namespace std::chrono;
const auto diff = now - a;
@@ -269,85 +298,161 @@ std::optional<QueueContext::duration_t> QueueContext::get_delay_time() {
<< " us " << ns << " ns ago\n";
}
- if (seq < frame->target_end_sequence) {
- std::cout << " frame hasn't ended yet!\n";
+ const auto end_seq_it = queue_sequences.find(end.context.queue);
+ assert(end_seq_it != std::end(queue_sequences));
+ const auto& end_seq = end_seq_it->second;
+ if (start_seq < end.sequence) {
+ std::cout << " frame hasn't started yet !\n ";
continue;
}
+ */
+ }
+ return std::nullopt;
+ //
+}
- const auto end_ticks =
- frame->end_context.timestamp_pool->get_polled(*frame->end, true);
- const auto& b_clock = frame->end_context.device_context.clock;
- std::cout << " END_TICKS: " << end_ticks << '\n';
- const auto b = b_clock.ticks_to_time(end_ticks);
- {
- using namespace std::chrono;
- if (now <= b) {
- std::cout << "b happened before now?\n";
- }
- const auto diff = now - b;
- const auto ms = duration_cast<milliseconds>(diff);
- const auto us = duration_cast<microseconds>(diff - ms);
- const auto ns = duration_cast<nanoseconds>(diff - ms - us);
- std::cout << " frame ended: " << ms << " ms " << us
- << " us " << ns << " ns ago\n";
- }
+// now it's all coming together
+// std::optional<QueueContext::duration_t> QueueContext::get_delay_time() {
+/*
+if (!std::size(this->in_flight_frames)) {
+ return std::nullopt;
+}
- const auto gpu_time = b - a;
- {
- using namespace std::chrono;
- const auto diff = gpu_time;
- const auto ms = duration_cast<milliseconds>(diff);
- const auto us = duration_cast<microseconds>(diff - ms);
- const auto ns = duration_cast<nanoseconds>(diff - ms - us);
- std::cout << " gpu_time: " << ms << " ms " << us
- << " us " << ns << " ns ago\n";
- }
+auto seq = std::uint64_t{};
+this->device_context.vtable.GetSemaphoreCounterValueKHR(
+ this->device_context.device, this->semaphore, &seq);
- /*
- cpu_frametimes.emplace_back(cpu_time);
- gpu_frametimes.emplace_back(gpu_time);
- */
- }
+// Get semaphore first, then poll!
+this->timestamp_pool->poll();
- /*
- if (remove_index.has_value()) {
- this->in_flight_frames.erase(std::begin(this->in_flight_frames),
- std::begin(this->in_flight_frames) +
- *remove_index);
+// idk how frequently we should call this.
+this->device_context.calibrate_timestamps();
+
+static auto gpu_frametimes = std::deque<uint64_t>{};
+static auto cpu_frametimes = std::deque<uint64_t>{};
+
+const auto S = std::size(this->in_flight_frames);
+
+std::cout << "\nSTART FRAME READOUT\n";
+std::cout << "error bound: " << this->device_context.clock.error_bound
+ << '\n';
+std::cout << "num frames in flight: " << S << '\n';
+std::cout << "from oldest -> newest\n";
+
+// const auto b_seq = semaphore_from_context(*this);
+const auto now = std::chrono::steady_clock::now();
+
+auto i = std::size_t{0};
+for (; i < std::size(this->in_flight_frames); ++i) {
+ const auto& frame = this->in_flight_frames[i];
+ std::cout << " Evaluating the frame that's " << S - i - 1
+ << " behind\n";
+ if (!frame) {
+ std::cout << " nullptr!\n";
+ continue;
}
- */
- /*
- auto g_copy = gpu_frametimes;
- auto c_copy = cpu_frametimes;
- std::ranges::sort(g_copy);
- std::ranges::sort(c_copy);
+ std::cout << " target start: " << frame->target_start_sequence <<
+'\n'; std::cout << " target end: " << frame->target_end_sequence << '\n'; if
+(seq < frame->target_start_sequence) { std::cout << " frame hasn't
+started yet!\n"; continue;
+ }
- constexpr auto N = 49;
- if (std::size(cpu_frametimes) < N) {
- return std::nullopt;
+ const auto start_ticks =
+ frame->start_context.timestamp_pool->get_polled(*frame->start);
+ std::cout << " START TICKS: " << start_ticks << '\n';
+ const auto& a_clock = frame->start_context.device_context.clock;
+ const auto a = a_clock.ticks_to_time(start_ticks);
+
+ {
+ using namespace std::chrono;
+ const auto diff = now - a;
+ const auto ms = duration_cast<milliseconds>(diff);
+ const auto us = duration_cast<microseconds>(diff - ms);
+ const auto ns = duration_cast<nanoseconds>(diff - ms - us);
+ std::cout << " frame started: " << ms << " ms " << us
+ << " us " << ns << " ns ago\n";
}
- const auto F = std::size(g_copy);
- // close enough to median lol
- const auto g = g_copy[F / 2];
- const auto c = c_copy[F / 2];
+ if (seq < frame->target_end_sequence) {
+ std::cout << " frame hasn't ended yet!\n";
+ continue;
+ }
- std::cout << g << '\n';
- std::cout << " median gpu: " << (g / 1'000'000) << " ms " << g / 1'000
- << " us " << g << " ns\n";
- std::cout << " median cpu: " << c / 1'000'000 << " ms " << c / 1'000
- << " us " << c << " ns\n";
+ const auto end_ticks =
+ frame->end_context.timestamp_pool->get_polled(*frame->end, true);
+ const auto& b_clock = frame->end_context.device_context.clock;
+ std::cout << " END_TICKS: " << end_ticks << '\n';
+ const auto b = b_clock.ticks_to_time(end_ticks);
+ {
+ using namespace std::chrono;
+ if (now <= b) {
+ std::cout << "b happened before now?\n";
+ }
+ const auto diff = now - b;
+ const auto ms = duration_cast<milliseconds>(diff);
+ const auto us = duration_cast<microseconds>(diff - ms);
+ const auto ns = duration_cast<nanoseconds>(diff - ms - us);
+ std::cout << " frame ended: " << ms << " ms " << us
+ << " us " << ns << " ns ago\n";
+ }
- if (F > N) {
- gpu_frametimes.pop_front();
- cpu_frametimes.pop_front();
+ const auto gpu_time = b - a;
+ {
+ using namespace std::chrono;
+ const auto diff = gpu_time;
+ const auto ms = duration_cast<milliseconds>(diff);
+ const auto us = duration_cast<microseconds>(diff - ms);
+ const auto ns = duration_cast<nanoseconds>(diff - ms - us);
+ std::cout << " gpu_time: " << ms << " ms " << us
+ << " us " << ns << " ns ago\n";
}
- */
+ /*
+ cpu_frametimes.emplace_back(cpu_time);
+ gpu_frametimes.emplace_back(gpu_time);
+}
+
+/*
+if (remove_index.has_value()) {
+ this->in_flight_frames.erase(std::begin(this->in_flight_frames),
+ std::begin(this->in_flight_frames) +
+ *remove_index);
+}
+*/
+
+/*
+auto g_copy = gpu_frametimes;
+auto c_copy = cpu_frametimes;
+std::ranges::sort(g_copy);
+std::ranges::sort(c_copy);
+
+constexpr auto N = 49;
+if (std::size(cpu_frametimes) < N) {
return std::nullopt;
}
+const auto F = std::size(g_copy);
+// close enough to median lol
+const auto g = g_copy[F / 2];
+const auto c = c_copy[F / 2];
+
+std::cout << g << '\n';
+
+std::cout << " median gpu: " << (g / 1'000'000) << " ms " << g / 1'000
+ << " us " << g << " ns\n";
+std::cout << " median cpu: " << c / 1'000'000 << " ms " << c / 1'000
+ << " us " << c << " ns\n";
+
+if (F > N) {
+ gpu_frametimes.pop_front();
+ cpu_frametimes.pop_front();
+}
+
+return std::nullopt;
+}
+*/
+
} // namespace low_latency \ No newline at end of file
diff --git a/src/queue_context.hh b/src/queue_context.hh
index a6f43e5..3df6af4 100644
--- a/src/queue_context.hh
+++ b/src/queue_context.hh
@@ -10,7 +10,6 @@
#include <chrono>
#include <deque>
#include <memory>
-#include <span>
#include <unordered_set>
namespace low_latency {
@@ -35,24 +34,26 @@ class QueueContext final : public Context {
struct Submission {
const std::unordered_set<VkSemaphore> signals;
const std::unordered_set<VkSemaphore> waits;
- const std::uint64_t target_semaphore_sequence;
- const std::shared_ptr<TimestampPool::Handle> timestamp_handle;
+ const std::uint64_t sequence;
+
+ const std::shared_ptr<TimestampPool::Handle> start_handle;
+ const std::shared_ptr<TimestampPool::Handle> end_handle;
};
std::deque<std::shared_ptr<Submission>> submissions;
// In flight frames!
// These might come from different contexts.
struct Frame {
- const QueueContext& start_context;
- const std::shared_ptr<TimestampPool::Handle> start;
- const std::uint64_t target_start_sequence;
- const QueueContext& end_context;
- const std::shared_ptr<TimestampPool::Handle> end;
- const std::uint64_t target_end_sequence;
+ struct Timepoint {
+ const QueueContext& context;
+ const std::shared_ptr<TimestampPool::Handle> handle;
+ const std::uint64_t sequence;
+ };
+
+ const Timepoint start;
+ const Timepoint end;
};
- // These can be null, it means we made presented without finding the
- // timestamps associated with the present.
std::deque<std::unique_ptr<Frame>> in_flight_frames;
public:
@@ -61,12 +62,12 @@ class QueueContext final : public Context {
virtual ~QueueContext();
public:
- void notify_submit(std::span<const VkSubmitInfo> infos,
- const std::uint64_t target_semaphore_sequence,
- std::shared_ptr<TimestampPool::Handle>&& handle);
- void notify_submit(std::span<const VkSubmitInfo2> infos,
- const std::uint64_t target_semaphore_sequence,
- std::shared_ptr<TimestampPool::Handle>&& handle);
+ void
+ notify_submit(const VkSubmitInfo& info, const std::uint64_t& sequence,
+ const std::shared_ptr<TimestampPool::Handle> head_handle,
+ const std::shared_ptr<TimestampPool::Handle> tail_handle);
+
+ // TODO submit2
void notify_present(const VkPresentInfoKHR& info);
diff --git a/src/timestamp_pool.cc b/src/timestamp_pool.cc
index b4dc3c9..cf48873 100644
--- a/src/timestamp_pool.cc
+++ b/src/timestamp_pool.cc
@@ -3,142 +3,152 @@
#include "queue_context.hh"
#include <ranges>
+#include <vulkan/utility/vk_dispatch_table.h>
#include <vulkan/vulkan_core.h>
namespace low_latency {
-TimestampPool::Block TimestampPool::allocate() {
- const auto& device_context = this->queue_context.device_context;
+TimestampPool::QueryChunk::QueryChunk(const QueueContext& queue_context) {
+ const auto& device_context = queue_context.device_context;
+ const auto& vtable = device_context.vtable;
- const auto query_pool = [&]() -> VkQueryPool {
+ this->query_pool = [&]() {
const auto qpci = VkQueryPoolCreateInfo{
.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
.queryType = VK_QUERY_TYPE_TIMESTAMP,
- .queryCount = this->TIMESTAMP_QUERY_POOL_SIZE};
-
- auto query_pool = VkQueryPool{};
+ .queryCount = QueryChunk::CHUNK_SIZE};
- device_context.vtable.CreateQueryPool(device_context.device, &qpci,
- nullptr, &query_pool);
- return query_pool;
+ auto qp = VkQueryPool{};
+ vtable.CreateQueryPool(device_context.device, &qpci, nullptr, &qp);
+ return qp;
}();
- const auto key_range =
- std::views::iota(0u, this->TIMESTAMP_QUERY_POOL_SIZE / 2) |
- std::views::transform([](const std::uint64_t& i) { return 2 * i; });
-
- auto available_indices = std::make_unique<available_query_indicies_t>(
- available_query_indicies_t{std::begin(key_range), std::end(key_range)});
-
- auto command_buffers = [&, this]() -> auto {
- auto command_buffers =
- std::vector<VkCommandBuffer>(this->TIMESTAMP_QUERY_POOL_SIZE);
+ constexpr auto key_range = std::views::iota(0u, QueryChunk::CHUNK_SIZE);
+ this->free_indices = std::make_unique<free_indices_t>(std::begin(key_range),
+ std::end(key_range));
+ this->command_buffers = [&, this]() -> auto {
+ auto cbs = std::make_unique<std::vector<VkCommandBuffer>>(CHUNK_SIZE);
const auto cbai = VkCommandBufferAllocateInfo{
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
- .commandPool = this->queue_context.command_pool,
+ .commandPool = queue_context.command_pool,
.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
- .commandBufferCount =
- static_cast<std::uint32_t>(std::size(command_buffers)),
+ .commandBufferCount = static_cast<std::uint32_t>(std::size(*cbs)),
};
- device_context.vtable.AllocateCommandBuffers(
- device_context.device, &cbai, std::data(command_buffers));
- std::ranges::for_each(command_buffers, [&](const auto& cb) {
- device_context.sdld(device_context.device, cb);
- });
- return std::make_unique<std::vector<VkCommandBuffer>>(command_buffers);
+ vtable.AllocateCommandBuffers(device_context.device, &cbai,
+ std::data(*cbs));
+ return cbs;
}();
-
- return Block{.query_pool = query_pool,
- .available_indicies = std::move(available_indices),
- .command_buffers = std::move(command_buffers)};
}
+TimestampPool::QueryChunk::~QueryChunk() {}
+
TimestampPool::TimestampPool(QueueContext& queue_context)
: queue_context(queue_context) {
- // Allocate one block on construction, it's likely more than enough!
- this->blocks.emplace_back(this->allocate());
+ // Allocate one block on construction, it's likely more than enough.
+ auto query_chunk = std::make_shared<QueryChunk>(this->queue_context);
+ this->query_chunks.emplace(std::move(query_chunk));
}
std::shared_ptr<TimestampPool::Handle> TimestampPool::acquire() {
- const auto vacant_iter = [this]() -> auto {
- const auto it =
- std::ranges::find_if(this->blocks, [](const auto& block) {
- return std::size(*block.available_indicies);
+
+ // Finds a chunk with free indices, or inserts a new one and returns it.
+ const auto not_empty_iter = [this]() -> auto {
+ const auto not_empty_iter =
+ std::ranges::find_if(this->query_chunks, [](const auto& qc) {
+ assert(qc);
+ return std::size(*qc->free_indices);
});
- if (it != std::end(this->blocks)) {
- return it;
+ if (not_empty_iter != std::end(this->query_chunks)) {
+ return not_empty_iter;
}
- this->blocks.emplace_back(this->allocate());
- return std::prev(std::end(this->blocks));
- }();
-
- const auto query_pool = vacant_iter->query_pool;
- auto& available_indices = *vacant_iter->available_indicies;
- // Grab any element from our set and erase it immediately after.
- const auto query_index = *std::begin(available_indices);
- available_indices.erase(std::begin(available_indices));
-
- const auto command_buffers = [&]() -> auto {
- auto command_buffers = std::array<VkCommandBuffer, 2>{};
- std::ranges::copy_n(
- std::next(std::begin(*vacant_iter->command_buffers), query_index),
- std::size(command_buffers), std::begin(command_buffers));
- return command_buffers;
+ const auto insert = std::make_shared<QueryChunk>(this->queue_context);
+ const auto [iter, did_insert] = this->query_chunks.emplace(insert);
+ assert(did_insert);
+ return iter;
}();
- const auto block_index = static_cast<std::size_t>(
- std::distance(std::begin(this->blocks), vacant_iter));
+ // Grab any free index and remove it. NOTE(review): erase() runs inside assert() below — compiled out under NDEBUG; hoist the call out of the assert.
+ auto& indices = *(*not_empty_iter)->free_indices;
+ const auto query_index = *std::begin(indices);
+ assert(indices.erase(query_index));
- return std::make_shared<Handle>(available_indices, block_index, query_pool,
- query_index, command_buffers);
+ return std::make_shared<Handle>(*not_empty_iter, query_index);
}
-TimestampPool::Handle::Handle(
- TimestampPool::available_query_indicies_t& index_origin,
- const std::size_t block_index, const VkQueryPool& query_pool,
- const std::uint64_t query_index,
- const std::array<VkCommandBuffer, 2>& command_buffers)
- : index_origin(index_origin), block_index(block_index),
- query_pool(query_pool), query_index(query_index),
- command_buffers(command_buffers) {}
+TimestampPool::Handle::Handle(const std::shared_ptr<QueryChunk>& origin_chunk,
+ const std::uint64_t& query_index)
+ : query_pool(origin_chunk->query_pool), query_index(query_index),
+ origin_chunk(origin_chunk),
+ command_buffer((*origin_chunk->command_buffers)[query_index]) {}
TimestampPool::Handle::~Handle() {
- assert(this->index_origin.insert(this->query_index).second);
+ // Return our index to the owning chunk, unless the chunk was already destroyed.
+ // NOTE(review): insert() inside assert() is compiled out under NDEBUG — the index would never be returned; hoist it out of the assert.
+ if (const auto ptr = this->origin_chunk.lock(); ptr) {
+ assert(ptr->free_indices->insert(this->query_index).second);
+ }
}
void TimestampPool::Handle::setup_command_buffers(
- const VkuDeviceDispatchTable& vtable) const {
-
- const auto& [head, tail] = this->command_buffers;
+ const Handle& tail, const QueueContext& queue_context) const {
const auto cbbi = VkCommandBufferBeginInfo{
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
};
- // Heads
- vtable.ResetCommandBuffer(head, 0);
- vtable.BeginCommandBuffer(head, &cbbi);
- // Reset the next two and make them unavailable when they are run!
- vtable.CmdResetQueryPool(head, this->query_pool, this->query_index, 2);
- vtable.CmdWriteTimestamp2KHR(head, VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
+
+ const auto& device_context = queue_context.device_context;
+ const auto& vtable = device_context.vtable;
+
+ vtable.ResetQueryPoolEXT(device_context.device, this->query_pool,
+ this->query_index, 1);
+
+ vtable.BeginCommandBuffer(this->command_buffer, &cbbi);
+ vtable.CmdWriteTimestamp2KHR(this->command_buffer,
+ VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
this->query_pool, this->query_index);
- vtable.EndCommandBuffer(head);
-
- // Tails
- vtable.ResetCommandBuffer(tail, 0);
- vtable.BeginCommandBuffer(tail, &cbbi);
- vtable.CmdWriteTimestamp2KHR(tail, VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT,
- this->query_pool, this->query_index + 1);
- vtable.EndCommandBuffer(tail);
+ vtable.EndCommandBuffer(this->command_buffer);
+
+ vtable.ResetQueryPoolEXT(device_context.device, tail.query_pool,
+ tail.query_index, 1);
+ vtable.ResetCommandBuffer(tail.command_buffer, 0);
+ vtable.BeginCommandBuffer(tail.command_buffer, &cbbi);
+ vtable.CmdWriteTimestamp2KHR(tail.command_buffer,
+ VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT,
+ tail.query_pool, tail.query_index);
+ vtable.EndCommandBuffer(tail.command_buffer);
}
-void TimestampPool::poll() {
- this->cached_timestamps.clear();
- this->cached_timestamps.reserve(std::size(this->blocks));
+std::optional<std::uint64_t>
+TimestampPool::Handle::get_ticks(const TimestampPool& pool) {
+
+ const auto& device_context = pool.queue_context.device_context;
+ const auto& vtable = device_context.vtable;
+
+ struct QueryResult {
+ std::uint64_t value;
+ std::uint64_t available;
+ };
+ auto query_result = QueryResult{};
+
+ const auto r = vtable.GetQueryPoolResults(
+ device_context.device, query_pool, this->query_index, 1,
+ sizeof(query_result), &query_result, sizeof(query_result),
+ VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT);
+
+ assert(r == VK_SUCCESS || r == VK_NOT_READY);
+
+ if (!query_result.available) {
+ return std::nullopt;
+ }
+ return query_result.value;
+}
+/*
+void TimestampPool::poll() {
const auto& device_context = this->queue_context.device_context;
std::ranges::transform(
@@ -163,26 +173,16 @@ void TimestampPool::poll() {
return timestamps;
});
};
-
-std::uint64_t TimestampPool::get_polled(const Handle& handle, const bool hack) {
-
- assert(handle.block_index < std::size(this->cached_timestamps));
-
- const auto& cached_timestamp = this->cached_timestamps[handle.block_index];
- assert(cached_timestamp != nullptr);
- assert(handle.query_index < std::size(*cached_timestamp));
-
- return (*cached_timestamp)[handle.query_index + hack];
-}
+*/
TimestampPool::~TimestampPool() {
const auto& device = this->queue_context.device_context.device;
const auto& vtable = this->queue_context.device_context.vtable;
- for (const auto& block : this->blocks) {
+ for (const auto& query_chunk : this->query_chunks) {
vtable.FreeCommandBuffers(device, this->queue_context.command_pool,
- std::size(*block.command_buffers),
- std::data(*block.command_buffers));
- vtable.DestroyQueryPool(device, block.query_pool, nullptr);
+ std::size(*query_chunk->command_buffers),
+ std::data(*query_chunk->command_buffers));
+ vtable.DestroyQueryPool(device, query_chunk->query_pool, nullptr);
}
}
diff --git a/src/timestamp_pool.hh b/src/timestamp_pool.hh
index a4aa429..f69b06f 100644
--- a/src/timestamp_pool.hh
+++ b/src/timestamp_pool.hh
@@ -40,6 +40,7 @@
#include <memory>
#include <unordered_set>
+#include <vector>
namespace low_latency {
@@ -47,58 +48,62 @@ class QueueContext;
class TimestampPool final {
private:
- static constexpr auto TIMESTAMP_QUERY_POOL_SIZE = 512u;
- static_assert(TIMESTAMP_QUERY_POOL_SIZE % 2 == 0);
-
- private:
QueueContext& queue_context;
- // VkQueryPool with an unordered set of keys available for reading.
- using available_query_indicies_t = std::unordered_set<std::uint64_t>;
+ // A chunk of data which is useful for making timestamp queries.
+ // Allows association of an index to a query pool and command buffer.
+ // We reuse these when they're released.
+ struct QueryChunk final {
+ private:
+ using free_indices_t = std::unordered_set<std::uint64_t>;
+ static constexpr auto CHUNK_SIZE = 512u;
- struct Block {
+ public:
VkQueryPool query_pool;
- std::unique_ptr<available_query_indicies_t> available_indicies;
+ std::unique_ptr<free_indices_t> free_indices;
std::unique_ptr<std::vector<VkCommandBuffer>> command_buffers;
- };
- std::vector<Block> blocks; // multiple blocks
- // A snapshot of all available blocks for reading after each poll.
- std::vector<std::unique_ptr<std::vector<std::uint64_t>>> cached_timestamps;
+ public:
+ QueryChunk(const QueueContext& queue_context);
+ QueryChunk(const QueryChunk& handle) = delete;
+ QueryChunk(QueryChunk&&) = delete;
+ QueryChunk operator=(const QueryChunk& handle) = delete;
+ QueryChunk operator=(QueryChunk&&) = delete;
+ ~QueryChunk();
+ };
+ std::unordered_set<std::shared_ptr<QueryChunk>> query_chunks;
public:
- // A handle represents two std::uint64_t blocks oftimestamp memory and two
- // command buffers.
+ // A handle represents a VkCommandBuffer and a query index.
+ // Once the Handle goes out of scope, the query index will be returned
+ // to the parent pool.
struct Handle final {
private:
friend class TimestampPool;
private:
- available_query_indicies_t& index_origin;
- const std::size_t block_index;
+ const std::weak_ptr<QueryChunk> origin_chunk;
public:
const VkQueryPool query_pool;
const std::uint64_t query_index;
- const std::array<VkCommandBuffer, 2> command_buffers;
+ const VkCommandBuffer command_buffer;
public:
- Handle(TimestampPool::available_query_indicies_t& index_origin,
- const std::size_t block_index, const VkQueryPool& query_pool,
- const std::uint64_t query_index,
- const std::array<VkCommandBuffer, 2>& command_buffers);
+ Handle(const std::shared_ptr<QueryChunk>& origin_chunk,
+ const std::uint64_t& query_index);
Handle(const Handle& handle) = delete;
Handle(Handle&&) = delete;
Handle operator=(const Handle& handle) = delete;
Handle operator=(Handle&&) = delete;
- ~Handle(); // frees from the pool
+ ~Handle();
public:
- void setup_command_buffers(const VkuDeviceDispatchTable& vtable) const;
- };
+ void setup_command_buffers(const Handle& tail,
+ const QueueContext& queue_context) const;
- private:
- Block allocate();
+ std::optional<std::uint64_t> get_ticks(const TimestampPool& pool);
+ };
public:
TimestampPool(QueueContext& queue_context);
@@ -109,12 +114,8 @@ class TimestampPool final {
~TimestampPool();
public:
- // Hands out a Handle with a pool and index of two uint64_t's.
+ // Hands out a Handle!
std::shared_ptr<Handle> acquire();
-
- void poll(); // saves the current state for future get's.
-
- std::uint64_t get_polled(const Handle& handle, const bool hack = false);
};
} // namespace low_latency