aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt2
-rw-r--r--src/latency_controller.hh15
-rw-r--r--src/layer.cc626
-rw-r--r--src/layer.hh2
-rw-r--r--src/queue_context.cc51
-rw-r--r--src/queue_context.hh44
-rw-r--r--src/timestamp_pool.cc172
-rw-r--r--src/timestamp_pool.hh123
8 files changed, 821 insertions, 214 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 793e637..09daa4c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,6 +36,6 @@ add_custom_command(TARGET ${LIBRARY_NAME} POST_BUILD
add_custom_command(TARGET ${LIBRARY_NAME} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
- "${CMAKE_CURRENT_SOURCE_DIR}/low_latency_layer.json"
+ "${CMAKE_CURRENT_SOURCE_DIR}/low_latency_layer.json"
"${OUTPUT_DIR}/"
)
diff --git a/src/latency_controller.hh b/src/latency_controller.hh
new file mode 100644
index 0000000..6672d5a
--- /dev/null
+++ b/src/latency_controller.hh
@@ -0,0 +1,15 @@
+#ifndef LATENCY_CONTROLLER_HH_
+#define LATENCY_CONTROLLER_HH_
+
+// The purpose of this file is to provide
+
+namespace low_latency {
+
+class LatencyController final {
+
+
+};
+
+};
+
+#endif \ No newline at end of file
diff --git a/src/layer.cc b/src/layer.cc
index 24cc519..94b4969 100644
--- a/src/layer.cc
+++ b/src/layer.cc
@@ -1,5 +1,6 @@
#include "layer.hh"
+#include <utility>
#include <vulkan/utility/vk_dispatch_table.h>
#include <vulkan/vk_layer.h>
#include <vulkan/vk_platform.h>
@@ -7,30 +8,31 @@
#include <vulkan/vulkan.hpp>
#include <vulkan/vulkan_core.h>
-#include <cstring>
+#include <deque>
#include <iostream>
#include <mutex>
#include <string_view>
#include <unordered_map>
+#include <unordered_set>
+
+#include "queue_context.hh"
+#include "timestamp_pool.hh"
namespace low_latency {
+// Global mutex for layer data.
static auto mutex = std::mutex{};
-struct command_stats {
- std::uint32_t num_draws;
- std::uint32_t num_instances;
- std::uint32_t num_verts;
-};
-static std::unordered_map<VkCommandBuffer, command_stats>
- commandbuffer_to_stats{};
-static std::unordered_map<void*, VkuInstanceDispatchTable> instance_dispatch;
-static std::unordered_map<void*, VkuDeviceDispatchTable> device_dispatch;
+// Mappings for device instances.
+static std::unordered_map<VkPhysicalDevice, VkInstance> device_instances;
+static std::unordered_map<void*, VkuInstanceDispatchTable> instance_vtables;
+static std::unordered_map<void*, VkuDeviceDispatchTable> device_vtables;
+
+static std::uint64_t current_frame = 0;
+static std::unordered_map<VkQueue, QueueContext> queue_contexts;
template <typename T>
concept DispatchableType =
- std::same_as<std::remove_cvref_t<T>, VkQueue> ||
- std::same_as<std::remove_cvref_t<T>, VkCommandBuffer> ||
std::same_as<std::remove_cvref_t<T>, VkInstance> ||
std::same_as<std::remove_cvref_t<T>, VkDevice> ||
std::same_as<std::remove_cvref_t<T>, VkPhysicalDevice>;
@@ -38,143 +40,72 @@ template <DispatchableType T> void* get_key(const T& inst) {
return *reinterpret_cast<void**>(inst);
}
-static VKAPI_ATTR VkResult VKAPI_CALL
-BeginCommandBuffer(VkCommandBuffer command_buffer,
- const VkCommandBufferBeginInfo* begin_info) {
- const auto lock = std::scoped_lock{mutex};
- commandbuffer_to_stats[command_buffer] = {};
- return device_dispatch[get_key(command_buffer)].BeginCommandBuffer(
- command_buffer, begin_info);
-}
-
-static VKAPI_ATTR void VKAPI_CALL CmdDraw(VkCommandBuffer command_buffer,
- std::uint32_t vertex_count,
- std::uint32_t instance_count,
- std::uint32_t first_vertex,
- std::uint32_t first_instance) {
-
- const auto lock = std::scoped_lock{mutex};
-
- if (const auto it = commandbuffer_to_stats.find(command_buffer);
- it != std::end(commandbuffer_to_stats)) {
-
- auto& stats = it->second;
- stats.num_draws++;
- stats.num_instances += instance_count;
- stats.num_verts += instance_count * vertex_count;
- }
-
- device_dispatch[get_key(command_buffer)].CmdDraw(
- command_buffer, vertex_count, instance_count, first_vertex,
- first_instance);
-}
-
-static VKAPI_ATTR void VKAPI_CALL CmdDrawIndexed(VkCommandBuffer command_buffer,
- uint32_t index_count,
- uint32_t instance_count,
- uint32_t first_index,
- int32_t vertex_offset,
- uint32_t first_instance) {
-
- const auto lock = std::scoped_lock{mutex};
-
- if (const auto it = commandbuffer_to_stats.find(command_buffer);
- it != std::end(commandbuffer_to_stats)) {
-
- auto& stats = it->second;
- stats.num_draws++;
- stats.num_instances += instance_count;
- stats.num_verts += instance_count * index_count;
- }
-
- device_dispatch[get_key(command_buffer)].CmdDrawIndexed(
- command_buffer, index_count, instance_count, first_index, vertex_offset,
- first_instance);
-}
-
-static VKAPI_ATTR VkResult VKAPI_CALL
-EndCommandBuffer(VkCommandBuffer command_buffer) {
-
- const auto lock = std::scoped_lock{mutex};
+template <typename T, typename sType>
+static T* get_link_info(const void* const head, const sType& stype) {
+ for (auto i = reinterpret_cast<const VkBaseInStructure*>(head); i;
+ i = i->pNext) {
- const auto& s = commandbuffer_to_stats[command_buffer];
+ if (i->sType != stype) {
+ continue;
+ }
- std::cout << std::format("Command buffer ended with {} draws, {} "
- "instances and {} vertices\n",
- s.num_draws, s.num_instances, s.num_verts);
+ const auto info = reinterpret_cast<const T*>(i);
+ if (info->function != VK_LAYER_LINK_INFO) {
+ continue;
+ }
- const auto it = device_dispatch.find(get_key(command_buffer));
- if (it == std::end(device_dispatch)) {
- return VK_ERROR_DEVICE_LOST;
+ return const_cast<T*>(info);
}
- return it->second.EndCommandBuffer(command_buffer);
+ return nullptr;
}
static VKAPI_ATTR VkResult VKAPI_CALL
CreateInstance(const VkInstanceCreateInfo* pCreateInfo,
const VkAllocationCallbacks* pAllocator, VkInstance* pInstance) {
- // Iterate through list starting at pNext until we see create_info and
- // link_info.
- auto layer_create_info = [&]() -> VkLayerInstanceCreateInfo* {
- for (auto base =
- reinterpret_cast<const VkBaseInStructure*>(pCreateInfo->pNext);
- base; base = base->pNext) {
-
- if (base->sType != VK_STRUCTURE_TYPE_LOADER_INSTANCE_CREATE_INFO) {
- continue;
- }
+ const auto link_info = get_link_info<VkLayerInstanceCreateInfo>(
+ pCreateInfo->pNext, VK_STRUCTURE_TYPE_LOADER_INSTANCE_CREATE_INFO);
- const auto info =
- reinterpret_cast<const VkLayerInstanceCreateInfo*>(base);
- if (info->function != VK_LAYER_LINK_INFO) {
- continue;
- }
- return const_cast<VkLayerInstanceCreateInfo*>(info);
- }
- return nullptr;
- }();
-
- if (!layer_create_info || !layer_create_info->u.pLayerInfo) {
+ if (!link_info || !link_info->u.pLayerInfo) {
return VK_ERROR_INITIALIZATION_FAILED;
}
// Store our get instance proc addr function and pop it off our list +
// advance the list so future layers know what to call.
- const auto next_gipa =
- layer_create_info->u.pLayerInfo->pfnNextGetInstanceProcAddr;
- if (!next_gipa) {
+ const auto gipa = link_info->u.pLayerInfo->pfnNextGetInstanceProcAddr;
+ if (!gipa) {
return VK_ERROR_INITIALIZATION_FAILED;
}
- layer_create_info->u.pLayerInfo = layer_create_info->u.pLayerInfo->pNext;
+ link_info->u.pLayerInfo = link_info->u.pLayerInfo->pNext;
// Call our create instance func, and store vkDestroyInstance, and
// vkCreateDevice as well.
- const auto create_instance_func = reinterpret_cast<PFN_vkCreateInstance>(
- next_gipa(VK_NULL_HANDLE, "vkCreateInstance"));
- if (!create_instance_func) {
+ const auto create_instance = reinterpret_cast<PFN_vkCreateInstance>(
+ gipa(VK_NULL_HANDLE, "vkCreateInstance"));
+ if (!create_instance) {
return VK_ERROR_INITIALIZATION_FAILED;
}
- if (const auto result =
- create_instance_func(pCreateInfo, pAllocator, pInstance);
+ if (const auto result = create_instance(pCreateInfo, pAllocator, pInstance);
result != VK_SUCCESS) {
return result;
}
const auto lock = std::scoped_lock{mutex};
- instance_dispatch.emplace(
+ instance_vtables.emplace(
get_key(*pInstance),
VkuInstanceDispatchTable{
.DestroyInstance = reinterpret_cast<PFN_vkDestroyInstance>(
- next_gipa(*pInstance, "vkDestroyInstance")),
+ gipa(*pInstance, "vkDestroyInstance")),
+ .EnumeratePhysicalDevices =
+ reinterpret_cast<PFN_vkEnumeratePhysicalDevices>(
+ gipa(*pInstance, "vkEnumeratePhysicalDevices")),
.GetInstanceProcAddr = reinterpret_cast<PFN_vkGetInstanceProcAddr>(
- next_gipa(*pInstance, "vkGetInstanceProcAddr")),
+ gipa(*pInstance, "vkGetInstanceProcAddr")),
.EnumerateDeviceExtensionProperties =
reinterpret_cast<PFN_vkEnumerateDeviceExtensionProperties>(
- next_gipa(*pInstance,
- "vkEnumerateDeviceExtensionProperties")),
+ gipa(*pInstance, "vkEnumerateDeviceExtensionProperties")),
}
);
@@ -186,75 +117,195 @@ static VKAPI_ATTR void VKAPI_CALL
DestroyInstance(VkInstance instance, const VkAllocationCallbacks* allocator) {
const auto lock = std::scoped_lock{mutex};
- instance_dispatch.erase(get_key(instance));
+
+ const auto key = get_key(instance);
+ assert(instance_vtables.contains(key));
+ instance_vtables.erase(key);
+}
+
+static VKAPI_ATTR VkResult VKAPI_CALL EnumeratePhysicalDevices(
+ VkInstance instance, std::uint32_t* count, VkPhysicalDevice* devices) {
+
+ const auto lock = std::scoped_lock{mutex};
+
+ const auto it = instance_vtables.find(get_key(instance));
+ assert(it != std::end(instance_vtables));
+ const auto& vtable = it->second;
+
+ if (const auto result =
+ vtable.EnumeratePhysicalDevices(instance, count, devices);
+ !devices || result != VK_SUCCESS) {
+
+ return result;
+ }
+
+ for (auto i = std::uint32_t{0}; i < *count; ++i) {
+ device_instances.emplace(devices[i], instance);
+ }
+
+ return VK_SUCCESS;
}
static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
VkPhysicalDevice physical_device, const VkDeviceCreateInfo* pCreateInfo,
const VkAllocationCallbacks* pAllocator, VkDevice* pDevice) {
- auto layer_create_info = [&]() -> VkLayerDeviceCreateInfo* {
- for (auto base =
- reinterpret_cast<const VkBaseInStructure*>(pCreateInfo->pNext);
- base; base = base->pNext) {
+ const auto create_info = get_link_info<VkLayerDeviceCreateInfo>(
+ pCreateInfo->pNext, VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO);
+ if (!create_info || !create_info->u.pLayerInfo) {
+ return VK_ERROR_INITIALIZATION_FAILED;
+ }
+
+ const auto gipa = create_info->u.pLayerInfo->pfnNextGetInstanceProcAddr;
+ const auto gdpa = create_info->u.pLayerInfo->pfnNextGetDeviceProcAddr;
+ if (!gipa || !gdpa) {
+ return VK_ERROR_INITIALIZATION_FAILED;
+ }
+ create_info->u.pLayerInfo = create_info->u.pLayerInfo->pNext;
+
+ const auto lock = std::scoped_lock{mutex};
+
+ const auto next_extensions =
+ [&]() -> std::optional<std::vector<const char*>> {
+ const auto supported_extensions =
+ [&]() -> std::optional<std::vector<VkExtensionProperties>> {
+ const auto enumerate_device_extensions =
+ reinterpret_cast<PFN_vkEnumerateDeviceExtensionProperties>(
+ gipa(device_instances[physical_device],
+ "vkEnumerateDeviceExtensionProperties"));
+ if (!enumerate_device_extensions) {
+ return std::nullopt;
+ }
+
+ auto count = std::uint32_t{};
+ if (enumerate_device_extensions(physical_device, nullptr, &count,
+ nullptr) != VK_SUCCESS) {
+
+ return std::nullopt;
+ }
+
+ auto supported_extensions =
+ std::vector<VkExtensionProperties>(count);
+ if (enumerate_device_extensions(physical_device, nullptr, &count,
+ std::data(supported_extensions)) !=
+ VK_SUCCESS) {
+
+ return std::nullopt;
+ }
+
+ return supported_extensions;
+ }();
+
+ auto next_extensions =
+ std::vector{*pCreateInfo->ppEnabledExtensionNames,
+ std::next(*pCreateInfo->ppEnabledExtensionNames +
+ pCreateInfo->enabledExtensionCount)};
+
+ const auto wanted_extensions = {
+ VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME,
+ VK_KHR_CALIBRATED_TIMESTAMPS_EXTENSION_NAME,
+ VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME};
+
+ for (const auto& wanted : wanted_extensions) {
+
+ if (std::ranges::any_of(
+ next_extensions, [&](const auto& next_extension) {
+ return !std::strcmp(next_extension, wanted);
+ })) {
- if (base->sType != VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO) {
- continue;
+ continue; // Already included, ignore it.
}
- const auto info =
- reinterpret_cast<const VkLayerDeviceCreateInfo*>(base);
+ if (std::ranges::none_of(*supported_extensions,
+ [&](const auto& supported_extension) {
+ return !std::strcmp(
+ supported_extension.extensionName,
+ wanted);
+ })) {
- if (info->function != VK_LAYER_LINK_INFO) {
- continue;
+ return std::nullopt; // We don't support it, the layer can't
+ // work.
}
- return const_cast<VkLayerDeviceCreateInfo*>(info);
+ next_extensions.push_back(wanted);
}
- return nullptr;
+
+ return next_extensions;
}();
- if (!layer_create_info || !layer_create_info->u.pLayerInfo) {
+ if (!next_extensions.has_value()) {
return VK_ERROR_INITIALIZATION_FAILED;
}
- const auto next_gipa =
- layer_create_info->u.pLayerInfo->pfnNextGetInstanceProcAddr;
- const auto next_gdpa =
- layer_create_info->u.pLayerInfo->pfnNextGetDeviceProcAddr;
- if (!next_gipa || !next_gdpa) {
+ const auto create_device = reinterpret_cast<PFN_vkCreateDevice>(
+ gipa(VK_NULL_HANDLE, "vkCreateDevice"));
+ if (!create_device) {
return VK_ERROR_INITIALIZATION_FAILED;
}
- layer_create_info->u.pLayerInfo = layer_create_info->u.pLayerInfo->pNext;
- const auto create_func = reinterpret_cast<PFN_vkCreateDevice>(
- next_gipa(VK_NULL_HANDLE, "vkCreateDevice"));
- if (!create_func) {
- return VK_ERROR_INITIALIZATION_FAILED;
- }
+ const auto next_create_info = [&]() -> VkDeviceCreateInfo {
+ auto next_pCreateInfo = *pCreateInfo;
+ next_pCreateInfo.ppEnabledExtensionNames = std::data(*next_extensions);
+ next_pCreateInfo.enabledExtensionCount = std::size(*next_extensions);
+ return next_pCreateInfo;
+ }();
- if (const auto result =
- create_func(physical_device, pCreateInfo, pAllocator, pDevice);
+ if (const auto result = create_device(physical_device, &next_create_info,
+ pAllocator, pDevice);
result != VK_SUCCESS) {
+
return result;
}
- const auto lock = std::scoped_lock{mutex};
- device_dispatch.emplace(
+ device_vtables.emplace(
get_key(*pDevice),
VkuDeviceDispatchTable{
.GetDeviceProcAddr = reinterpret_cast<PFN_vkGetDeviceProcAddr>(
- next_gdpa(*pDevice, "vkGetDeviceProcAddr")),
+ gdpa(*pDevice, "vkGetDeviceProcAddr")),
.DestroyDevice = reinterpret_cast<PFN_vkDestroyDevice>(
- next_gdpa(*pDevice, "vkDestroyDevice")),
+ gdpa(*pDevice, "vkDestroyDevice")),
+ .GetDeviceQueue = reinterpret_cast<PFN_vkGetDeviceQueue>(
+ gdpa(*pDevice, "vkGetDeviceQueue")),
+ .QueueSubmit = reinterpret_cast<PFN_vkQueueSubmit>(
+ gdpa(*pDevice, "vkQueueSubmit")),
+ .CreateSemaphore = reinterpret_cast<PFN_vkCreateSemaphore>(
+ gdpa(*pDevice, "vkCreateSemaphore")),
+ .CreateQueryPool = reinterpret_cast<PFN_vkCreateQueryPool>(
+ gdpa(*pDevice, "vkCreateQueryPool")),
+ .GetQueryPoolResults = reinterpret_cast<PFN_vkGetQueryPoolResults>(
+ gdpa(*pDevice, "vkGetQueryPoolResults")),
+ .CreateCommandPool = reinterpret_cast<PFN_vkCreateCommandPool>(
+ gdpa(*pDevice, "vkCreateCommandPool")),
+ .AllocateCommandBuffers =
+ reinterpret_cast<PFN_vkAllocateCommandBuffers>(
+ gdpa(*pDevice, "vkAllocateCommandBuffers")),
.BeginCommandBuffer = reinterpret_cast<PFN_vkBeginCommandBuffer>(
- next_gdpa(*pDevice, "vkBeginCommandBuffer")),
+ gdpa(*pDevice, "vkBeginCommandBuffer")),
.EndCommandBuffer = reinterpret_cast<PFN_vkEndCommandBuffer>(
- next_gdpa(*pDevice, "vkEndCommandBuffer")),
- .CmdDraw = reinterpret_cast<PFN_vkCmdDraw>(
- next_gdpa(*pDevice, "vkCmdDraw")),
+ gdpa(*pDevice, "vkEndCommandBuffer")),
+ .ResetCommandBuffer = reinterpret_cast<PFN_vkResetCommandBuffer>(
+ gdpa(*pDevice, "vkResetCommandBuffer")),
+ .CmdDraw =
+ reinterpret_cast<PFN_vkCmdDraw>(gdpa(*pDevice, "vkCmdDraw")),
.CmdDrawIndexed = reinterpret_cast<PFN_vkCmdDrawIndexed>(
- next_gdpa(*pDevice, "vkCmdDrawIndexed")),
+ gdpa(*pDevice, "vkCmdDrawIndexed")),
+ .CmdResetQueryPool = reinterpret_cast<PFN_vkCmdResetQueryPool>(
+ gdpa(*pDevice, "vkCmdResetQueryPool")),
+ .GetDeviceQueue2 = reinterpret_cast<PFN_vkGetDeviceQueue2>(
+ gdpa(*pDevice, "vkGetDeviceQueue2")),
+ .QueueSubmit2 = reinterpret_cast<PFN_vkQueueSubmit2>(
+ gdpa(*pDevice, "vkQueueSubmit2")),
+ .QueuePresentKHR = reinterpret_cast<PFN_vkQueuePresentKHR>(
+ gdpa(*pDevice, "vkQueuePresentKHR")),
+ .GetSemaphoreCounterValueKHR =
+ reinterpret_cast<PFN_vkGetSemaphoreCounterValueKHR>(
+ gdpa(*pDevice, "vkGetSemaphoreCounterValueKHR")),
+ .CmdWriteTimestamp2KHR =
+ reinterpret_cast<PFN_vkCmdWriteTimestamp2KHR>(
+ gdpa(*pDevice, "vkCmdWriteTimestamp2KHR")),
+ .QueueSubmit2KHR = reinterpret_cast<PFN_vkQueueSubmit2KHR>(
+ gdpa(*pDevice, "vkQueueSubmit2KHR")),
+
});
return VK_SUCCESS;
@@ -264,68 +315,225 @@ static VKAPI_ATTR void VKAPI_CALL
DestroyDevice(VkDevice device, const VkAllocationCallbacks* allocator) {
const auto lock = std::scoped_lock{mutex};
- device_dispatch.erase(get_key(device));
+ const auto key = get_key(device);
+ assert(device_vtables.contains(key));
+ device_vtables.erase(key);
}
-// These are wrong, the tutorial isn't correct afaik.
-static VKAPI_ATTR VkResult VKAPI_CALL EnumerateInstanceLayerProperties(
- std::uint32_t* pPropertyCount, VkLayerProperties* pProperties) {
+// Small amount of duplication, we can't assume gdq2 is available apparently.
+static VKAPI_ATTR void VKAPI_CALL
+GetDeviceQueue(VkDevice device, std::uint32_t queue_family_index,
+ std::uint32_t queue_index, VkQueue* queue) {
+
+ const auto lock = std::scoped_lock{mutex};
+ const auto& vtable = device_vtables[get_key(device)];
+
+ vtable.GetDeviceQueue(device, queue_family_index, queue_index, queue);
+ if (!queue || !*queue) {
+ return;
+ }
- if (pPropertyCount) {
- *pPropertyCount = 1;
+ if (!queue_contexts.contains(*queue)) {
+ queue_contexts.emplace(
+ std::piecewise_construct, std::forward_as_tuple(*queue),
+ std::forward_as_tuple(device, *queue, queue_family_index, vtable));
}
+}
+
+static VKAPI_ATTR void VKAPI_CALL GetDeviceQueue2(
+ VkDevice device, const VkDeviceQueueInfo2* info, VkQueue* queue) {
+
+ const auto lock = std::scoped_lock{mutex};
+ const auto& vtable = device_vtables[get_key(device)];
- if (pProperties) {
- std::strcpy(pProperties->layerName, LAYER_NAME);
- std::strcpy(pProperties->description, "Low Latency Layer");
- pProperties->implementationVersion = 1;
- pProperties->specVersion = VK_API_VERSION_1_3;
+ vtable.GetDeviceQueue2(device, info, queue);
+ if (!queue || !*queue) {
+ return;
}
- return VK_SUCCESS;
+ if (!queue_contexts.contains(*queue)) {
+ queue_contexts.emplace(
+ std::piecewise_construct, std::forward_as_tuple(*queue),
+ std::forward_as_tuple(device, *queue, info->queueFamilyIndex,
+ vtable));
+ }
}
-static VKAPI_ATTR VkResult VKAPI_CALL EnumerateDeviceLayerProperties(
- VkPhysicalDevice physical_device, uint32_t* pPropertyCount,
- VkLayerProperties* pProperties) {
+static VKAPI_ATTR VkResult VKAPI_CALL
+vkQueueSubmit(VkQueue queue, std::uint32_t submit_count,
+ const VkSubmitInfo* submit_info, VkFence fence) {
+
+ const auto lock = std::scoped_lock{mutex};
+
+ auto& queue_context = [&]() -> auto& {
+ const auto& queue_context_it = queue_contexts.find(queue);
+ assert(queue_context_it != std::end(queue_contexts));
+ return queue_context_it->second;
+ }();
+ const auto& vtable = device_vtables[get_key(queue_context.device)];
+
+ if (!submit_count) { // no-op submit we shouldn't worry about
+ return vtable.QueueSubmit(queue, submit_count, submit_info, fence);
+ }
+
+ // Create a new vector of submit infos, copy their existing ones.
+ auto next_submit_infos = std::vector<VkSubmitInfo>{};
+ next_submit_infos.reserve(submit_count + 2);
+
+ auto timestamp_handle = queue_context.timestamp_pool.acquire();
+ timestamp_handle->setup_command_buffers(vtable);
+
+ const auto& [head_cb, tail_cb] = timestamp_handle->command_buffers;
+
+ // The first submit info we use will steal their wait semaphores.
+ next_submit_infos.push_back(VkSubmitInfo{
+ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+ .pNext = submit_info->pNext,
+ .waitSemaphoreCount = submit_info[0].waitSemaphoreCount,
+ .pWaitSemaphores = submit_info[0].pWaitSemaphores,
+ .pWaitDstStageMask = submit_info[0].pWaitDstStageMask,
+ .commandBufferCount = 1,
+ .pCommandBuffers = &head_cb,
+ });
+
+ // Fill in original submit infos but erase the wait semaphores on the
+ // first because we stole them earlier.
+ std::ranges::copy_n(submit_info, submit_count,
+ std::back_inserter(next_submit_infos));
+ next_submit_infos[1].pWaitSemaphores = nullptr;
+ next_submit_infos[1].waitSemaphoreCount = 0u;
+
+ const auto TODO_next = std::uint64_t{current_frame + 1};
+ const auto tail_tssi = VkTimelineSemaphoreSubmitInfo{
+ .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR,
+ .signalSemaphoreValueCount = 1,
+ .pSignalSemaphoreValues = &TODO_next,
+ };
+ next_submit_infos.push_back(VkSubmitInfo{
+ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+ .pNext = &tail_tssi,
+ .commandBufferCount = 1,
+ .pCommandBuffers = &tail_cb,
+ .signalSemaphoreCount = 1,
+ .pSignalSemaphores = &queue_context.semaphore,
+ });
+
+ if (const auto res =
+ vtable.QueueSubmit(queue, std::size(next_submit_infos),
+ std::data(next_submit_infos), fence);
+ res != VK_SUCCESS) {
+
+ return res;
+ }
- return EnumerateInstanceLayerProperties(pPropertyCount, pProperties);
+ return VK_SUCCESS;
}
-static VKAPI_ATTR VkResult VKAPI_CALL EnumerateInstanceExtensionProperties(
- const char* pLayerName, uint32_t* pPropertyCount,
- VkExtensionProperties* pProperties) {
+// The logic for this function is identical to vkSubmitInfo.
+static VKAPI_ATTR VkResult VKAPI_CALL
+vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count,
+ const VkSubmitInfo2* submit_infos, VkFence fence) {
- if (!pLayerName || std::string_view{pLayerName} != LAYER_NAME) {
+ const auto lock = std::scoped_lock{mutex};
+ auto& queue_context = [&]() -> auto& {
+ const auto& queue_context_it = queue_contexts.find(queue);
+ assert(queue_context_it != std::end(queue_contexts));
+ return queue_context_it->second;
+ }();
+ const auto& vtable = device_vtables[get_key(queue_context.device)];
- return VK_ERROR_LAYER_NOT_PRESENT;
+ if (!submit_count) { // another no-op submit
+ return vtable.QueueSubmit2(queue, submit_count, submit_infos, fence);
}
- if (pPropertyCount) {
- *pPropertyCount = 0;
+ auto next_submit_infos = std::vector<VkSubmitInfo2>();
+ next_submit_infos.reserve(submit_count + 2);
+
+ auto timestamp_handle = queue_context.timestamp_pool.acquire();
+ timestamp_handle->setup_command_buffers(vtable);
+ const auto& [head_cb, tail_cb] = timestamp_handle->command_buffers;
+
+ const auto head_cb_info = VkCommandBufferSubmitInfo{
+ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
+ .commandBuffer = head_cb,
+ };
+ next_submit_infos.push_back(VkSubmitInfo2{
+ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
+ .waitSemaphoreInfoCount = submit_infos[0].waitSemaphoreInfoCount,
+ .pWaitSemaphoreInfos = submit_infos[0].pWaitSemaphoreInfos,
+ .commandBufferInfoCount = 1,
+ .pCommandBufferInfos = &head_cb_info,
+ });
+ std::ranges::copy_n(submit_infos, submit_count,
+ std::back_inserter(next_submit_infos));
+ next_submit_infos[1].pWaitSemaphoreInfos = nullptr;
+ next_submit_infos[1].waitSemaphoreInfoCount = 0;
+
+ const auto tail_cb_info = VkCommandBufferSubmitInfo{
+ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
+ .commandBuffer = tail_cb,
+ };
+ next_submit_infos.push_back(VkSubmitInfo2{
+ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
+ .waitSemaphoreInfoCount = submit_infos[0].waitSemaphoreInfoCount,
+ .pWaitSemaphoreInfos = submit_infos[0].pWaitSemaphoreInfos,
+ .commandBufferInfoCount = 1,
+ .pCommandBufferInfos = &tail_cb_info,
+ });
+
+ if (const auto res =
+ vtable.QueueSubmit2(queue, submit_count, submit_infos, fence);
+ res != VK_SUCCESS) {
+ return res;
}
+
return VK_SUCCESS;
}
-static VKAPI_ATTR VkResult VKAPI_CALL EnumerateDeviceExtensionProperties(
- VkPhysicalDevice physical_device, const char* pLayerName,
- uint32_t* pPropertyCount, VkExtensionProperties* pProperties) {
+static VKAPI_ATTR VkResult VKAPI_CALL
+vkQueueSubmit2KHR(VkQueue queue, std::uint32_t submit_count,
+ const VkSubmitInfo2* submit_info, VkFence fence) {
+ // Just forward to low_latency::vkQueueSubmit2 here.
+ return low_latency::vkQueueSubmit2(queue, submit_count, submit_info, fence);
+}
- if (!pLayerName || std::string_view{pLayerName} != LAYER_NAME) {
+static VKAPI_ATTR VkResult VKAPI_CALL
+vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* present_info) {
- if (physical_device == VK_NULL_HANDLE) {
- return VK_SUCCESS;
- }
+ const auto lock = std::scoped_lock{mutex};
+ auto& queue_context = [&]() -> auto& {
+ const auto& queue_context_it = queue_contexts.find(queue);
+ assert(queue_context_it != std::end(queue_contexts));
+ return queue_context_it->second;
+ }();
+ const auto& vtable = device_vtables[get_key(queue_context.device)];
- const auto lock = std::scoped_lock{mutex};
- return instance_dispatch[get_key(physical_device)]
- .EnumerateDeviceExtensionProperties(physical_device, pLayerName,
- pPropertyCount, pProperties);
+ if (const auto res = vtable.QueuePresentKHR(queue, present_info);
+ res != VK_SUCCESS) {
+
+ return res;
}
- if (pPropertyCount) {
- *pPropertyCount = 0;
+ std::cout << "queuePresentKHR called for queue " << queue << '\n';
+
+ // Update all of our information about this queue's timestamp pool!
+ queue_context.timestamp_pool.poll();
+
+ // While we might be submitting on this queue, let's see what our timeline
+ // semaphore says we're at.
+ uint64_t value = 0;
+ if (const auto res = vtable.GetSemaphoreCounterValueKHR(
+ queue_context.device, queue_context.semaphore, &value);
+ res != VK_SUCCESS) {
+
+ return res;
}
+
+ std::cout << " frame_index: " << current_frame << '\n';
+ std::cout << " semaphore: " << value << '\n';
+ std::cout << " queue: " << queue << '\n';
+
+ ++current_frame;
return VK_SUCCESS;
}
@@ -336,17 +544,14 @@ static const auto instance_functions =
{"vkGetInstanceProcAddr",
reinterpret_cast<PFN_vkVoidFunction>(LowLatency_GetInstanceProcAddr)},
- {"vkEnumerateInstanceLayerProperties",
- reinterpret_cast<PFN_vkVoidFunction>(
- low_latency::EnumerateInstanceLayerProperties)},
- {"vkEnumerateInstanceExtensionProperties",
- reinterpret_cast<PFN_vkVoidFunction>(
- low_latency::EnumerateInstanceExtensionProperties)},
-
{"vkCreateInstance",
reinterpret_cast<PFN_vkVoidFunction>(low_latency::CreateInstance)},
{"vkDestroyInstance",
reinterpret_cast<PFN_vkVoidFunction>(low_latency::DestroyInstance)},
+
+ {"vkEnumeratePhysicalDevices",
+ reinterpret_cast<PFN_vkVoidFunction>(
+ low_latency::EnumeratePhysicalDevices)},
};
static const auto device_functions =
@@ -354,27 +559,23 @@ static const auto device_functions =
{"vkGetDeviceProcAddr",
reinterpret_cast<PFN_vkVoidFunction>(LowLatency_GetDeviceProcAddr)},
- {"vkEnumerateDeviceLayerProperties",
- reinterpret_cast<PFN_vkVoidFunction>(
- low_latency::EnumerateDeviceLayerProperties)},
- {"vkEnumerateDeviceExtensionProperties",
- reinterpret_cast<PFN_vkVoidFunction>(
- low_latency::EnumerateDeviceExtensionProperties)},
-
{"vkCreateDevice",
reinterpret_cast<PFN_vkVoidFunction>(low_latency::CreateDevice)},
{"vkDestroyDevice",
reinterpret_cast<PFN_vkVoidFunction>(low_latency::DestroyDevice)},
- {"vkCmdDraw",
- reinterpret_cast<PFN_vkVoidFunction>(low_latency::CmdDraw)},
- {"vkCmdDrawIndexed",
- reinterpret_cast<PFN_vkVoidFunction>(low_latency::CmdDrawIndexed)},
+ {"vkGetDeviceQueue",
+ reinterpret_cast<PFN_vkVoidFunction>(low_latency::GetDeviceQueue)},
+ {"vkGetDeviceQueue2",
+ reinterpret_cast<PFN_vkVoidFunction>(low_latency::GetDeviceQueue2)},
- {"vkBeginCommandBuffer",
- reinterpret_cast<PFN_vkVoidFunction>(low_latency::BeginCommandBuffer)},
- {"vkEndCommandBuffer",
- reinterpret_cast<PFN_vkVoidFunction>(low_latency::EndCommandBuffer)},
+ {"vkQueueSubmit",
+ reinterpret_cast<PFN_vkVoidFunction>(low_latency::vkQueueSubmit)},
+ {"vkQueueSubmit2",
+ reinterpret_cast<PFN_vkVoidFunction>(low_latency::vkQueueSubmit2)},
+
+ {"vkQueuePresentKHR",
+ reinterpret_cast<PFN_vkVoidFunction>(low_latency::vkQueuePresentKHR)},
};
VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL
@@ -387,7 +588,7 @@ LowLatency_GetDeviceProcAddr(VkDevice device, const char* const pName) {
}
const auto lock = std::scoped_lock{low_latency::mutex};
- return low_latency::device_dispatch[low_latency::get_key(device)]
+ return low_latency::device_vtables[low_latency::get_key(device)]
.GetDeviceProcAddr(device, pName);
}
@@ -395,14 +596,13 @@ VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL
LowLatency_GetInstanceProcAddr(VkInstance instance, const char* const pName) {
for (const auto& functions : {device_functions, instance_functions}) {
- const auto it = functions.find(pName);
- if (it == std::end(functions)) {
- continue;
+
+ if (const auto it = functions.find(pName); it != std::end(functions)) {
+ return it->second;
}
- return it->second;
}
const auto lock = std::scoped_lock{low_latency::mutex};
- return low_latency::instance_dispatch[low_latency::get_key(instance)]
+ return low_latency::instance_vtables[low_latency::get_key(instance)]
.GetInstanceProcAddr(instance, pName);
} \ No newline at end of file
diff --git a/src/layer.hh b/src/layer.hh
index 5633c63..08152f2 100644
--- a/src/layer.hh
+++ b/src/layer.hh
@@ -4,6 +4,8 @@
#include <vulkan/vk_platform.h>
#include <vulkan/vulkan.hpp>
+// The purpose of this file is to expose a header entry point for our layer.
+
extern "C" {
VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL
diff --git a/src/queue_context.cc b/src/queue_context.cc
new file mode 100644
index 0000000..dbae4c0
--- /dev/null
+++ b/src/queue_context.cc
@@ -0,0 +1,51 @@
+#include "queue_context.hh"
+
+namespace low_latency {
+
+static VkCommandPool make_command_pool(const VkDevice& device,
+ const std::uint32_t& queue_family_index,
+ const VkuDeviceDispatchTable& vtable) {
+
+ const auto cpci = VkCommandPoolCreateInfo{
+ .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+ .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT |
+ VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
+ .queueFamilyIndex = queue_family_index,
+ };
+
+ auto command_pool = VkCommandPool{};
+ vtable.CreateCommandPool(device, &cpci, nullptr, &command_pool);
+ return command_pool;
+}
+
+QueueContext::QueueContext(const VkDevice& device, const VkQueue queue,
+ const std::uint32_t& queue_family_index,
+ const VkuDeviceDispatchTable& vtable)
+ : device(device), queue(queue), queue_family_index(queue_family_index),
+ vtable(vtable),
+ // Important we make the command pool before the timestamp pool, because it's a dependency.
+ command_pool(make_command_pool(device, queue_family_index, vtable)),
+ timestamp_pool(device, vtable, command_pool) {
+
+ this->semaphore = [&]() -> VkSemaphore {
+ const auto stci = VkSemaphoreTypeCreateInfo{
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
+ .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE,
+ .initialValue = 0,
+ };
+
+ const auto sci = VkSemaphoreCreateInfo{
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
+ .pNext = &stci,
+ };
+
+ auto semaphore = VkSemaphore{};
+ vtable.CreateSemaphore(device, &sci, nullptr, &semaphore);
+ return semaphore;
+ }();
+}
+
+QueueContext::~QueueContext() {
+}
+
+} // namespace low_latency \ No newline at end of file
diff --git a/src/queue_context.hh b/src/queue_context.hh
new file mode 100644
index 0000000..eb3f2ea
--- /dev/null
+++ b/src/queue_context.hh
@@ -0,0 +1,44 @@
+#ifndef QUEUE_STATE_HH_
+#define QUEUE_STATE_HH_
+
+#include "timestamp_pool.hh"
+
+#include <vulkan/utility/vk_dispatch_table.h>
+#include <vulkan/vulkan.hpp>
+
+#include <deque>
+#include <vector>
+
+namespace low_latency {
+
+class QueueContext final {
+ public:
+ VkDevice device;
+ VkuDeviceDispatchTable vtable;
+
+ VkQueue queue;
+ std::uint32_t queue_family_index;
+
+ VkSemaphore semaphore;
+ VkCommandPool command_pool;
+
+ TimestampPool timestamp_pool;
+
+ std::deque<
+ std::vector<std::pair<TimestampPool::Handle, TimestampPool::Handle>>>
+ tracked_queues;
+
+ public:
+ QueueContext(const VkDevice& device, const VkQueue queue,
+ const std::uint32_t& queue_family_index,
+ const VkuDeviceDispatchTable& vtable);
+ QueueContext(const QueueContext&) = delete;
+ QueueContext(QueueContext&&) = delete;
+ QueueContext operator==(const QueueContext&) = delete;
+ QueueContext operator==(QueueContext&&) = delete;
+ ~QueueContext();
+};
+
+}; // namespace low_latency
+
+#endif \ No newline at end of file
diff --git a/src/timestamp_pool.cc b/src/timestamp_pool.cc
new file mode 100644
index 0000000..1dc37b2
--- /dev/null
+++ b/src/timestamp_pool.cc
@@ -0,0 +1,172 @@
+#include "timestamp_pool.hh"
+
+#include <vulkan/vulkan_core.h>
+
+#include <array>
+#include <cassert>
+#include <ranges>
+
+namespace low_latency {
+
+TimestampPool::block TimestampPool::allocate() {
+    // Create one strip of TIMESTAMP_QUERY_POOL_SIZE timestamp queries.
+    const auto query_pool = [&]() -> VkQueryPool {
+        const auto qpci = VkQueryPoolCreateInfo{
+            .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
+            .queryType = VK_QUERY_TYPE_TIMESTAMP,
+            .queryCount = this->TIMESTAMP_QUERY_POOL_SIZE};
+
+        auto query_pool = VkQueryPool{};
+        vtable.CreateQueryPool(device, &qpci, nullptr, &query_pool);
+        return query_pool;
+    }();
+
+    // Queries are handed out in (start, end) pairs, so only the even
+    // indices 0, 2, 4, ... are ever available for acquisition.
+    const auto key_range =
+        std::views::iota(0u, this->TIMESTAMP_QUERY_POOL_SIZE / 2) |
+        std::views::transform([](const std::uint64_t& i) { return 2 * i; });
+
+    // Construct the set in place instead of building a temporary and
+    // copying it into the shared_ptr.
+    auto available_keys = std::make_shared<available_query_indicies_t>(
+        std::begin(key_range), std::end(key_range));
+
+    // One command buffer per query: buffer 2k records the start timestamp
+    // and buffer 2k+1 records the end timestamp for pair k.
+    auto command_buffers = [this]() -> auto {
+        auto command_buffers =
+            std::vector<VkCommandBuffer>(this->TIMESTAMP_QUERY_POOL_SIZE);
+
+        const auto cbai = VkCommandBufferAllocateInfo{
+            .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+            .commandPool = this->command_pool,
+            .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+            .commandBufferCount =
+                static_cast<std::uint32_t>(std::size(command_buffers)),
+        };
+        vtable.AllocateCommandBuffers(device, &cbai,
+                                      std::data(command_buffers));
+        // PERF FIX: move the vector into the unique_ptr instead of copying.
+        return std::make_unique<std::vector<VkCommandBuffer>>(
+            std::move(command_buffers));
+    }();
+
+    return block{.query_pool = query_pool,
+                 .available_indicies = std::move(available_keys),
+                 .command_buffers = std::move(command_buffers)};
+}
+
+// Build the pool with a single pre-allocated block; for most applications
+// one block is more than enough and no further allocation ever happens.
+TimestampPool::TimestampPool(const VkDevice& device,
+                             const VkuDeviceDispatchTable& vtable,
+                             const VkCommandPool& command_pool)
+    : device(device), vtable(vtable), command_pool(command_pool) {
+    this->blocks.push_back(this->allocate());
+}
+
+std::unique_ptr<TimestampPool::Handle> TimestampPool::acquire() {
+    // Find a block that still has a free (start, end) query pair; if every
+    // block is exhausted, grow the pool by one freshly allocated block.
+    auto vacant_iter =
+        std::ranges::find_if(this->blocks, [](const auto& block) {
+            return !block.available_indicies->empty();
+        });
+    if (vacant_iter == std::end(this->blocks)) {
+        this->blocks.emplace_back(this->allocate());
+        vacant_iter = std::prev(std::end(this->blocks));
+    }
+
+    const auto query_pool = vacant_iter->query_pool;
+    auto& available_indices = vacant_iter->available_indicies;
+
+    // Pop an arbitrary free index out of the set.
+    const auto first_free = std::begin(*available_indices);
+    const auto query_index = *first_free;
+    available_indices->erase(first_free);
+
+    // The two command buffers for this query pair live at the same offset
+    // in the block's command buffer strip.
+    auto command_buffers = std::array<VkCommandBuffer, 2>{};
+    std::ranges::copy_n(
+        std::next(std::begin(*vacant_iter->command_buffers), query_index),
+        std::size(command_buffers), std::begin(command_buffers));
+
+    const auto block_index = static_cast<std::size_t>(
+        std::distance(std::begin(this->blocks), vacant_iter));
+
+    return std::make_unique<Handle>(available_indices, block_index, query_pool,
+                                    query_index, command_buffers);
+}
+
+// Handle: a ticket for one (start, end) timestamp query pair plus its two
+// command buffers. `index_origin` is a weak reference to the owning block's
+// free-index set, so the destructor can return the index to the pool (or
+// skip that safely if the pool has already been destroyed).
+TimestampPool::Handle::Handle(
+    const std::weak_ptr<TimestampPool::available_query_indicies_t>&
+        index_origin,
+    const std::size_t block_index, const VkQueryPool& query_pool,
+    const std::uint64_t query_index,
+    const std::array<VkCommandBuffer, 2>& command_buffers)
+    : index_origin(index_origin), block_index(block_index),
+      query_pool(query_pool), query_index(query_index),
+      command_buffers(command_buffers) {}
+
+TimestampPool::Handle::~Handle() {
+    // Return our query index to the owning block's free set — unless the
+    // pool itself is already gone, in which case the weak_ptr won't lock
+    // and there is nothing left to return the index to.
+    const auto origin = this->index_origin.lock();
+    if (origin != nullptr) {
+        assert(origin->count(this->query_index) == 0);
+        origin->insert(this->query_index);
+    }
+}
+
+void TimestampPool::Handle::setup_command_buffers(
+    const VkuDeviceDispatchTable& vtable) const {
+
+    const auto begin_info = VkCommandBufferBeginInfo{
+        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+    };
+
+    // Re-record one command buffer: reset, begin, caller-supplied commands,
+    // end — the shared boilerplate for both the head and tail buffers.
+    const auto record = [&](const VkCommandBuffer cb, const auto& commands) {
+        vtable.ResetCommandBuffer(cb, 0);
+        vtable.BeginCommandBuffer(cb, &begin_info);
+        commands(cb);
+        vtable.EndCommandBuffer(cb);
+    };
+
+    const auto& [head, tail] = this->command_buffers;
+
+    // Head: reset the two queries for this handle (making them unavailable
+    // once this runs), then stamp the "start" time.
+    record(head, [&](const VkCommandBuffer cb) {
+        vtable.CmdResetQueryPool(cb, this->query_pool, this->query_index, 2);
+        vtable.CmdWriteTimestamp2KHR(cb, VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
+                                     this->query_pool, this->query_index);
+    });
+
+    // Tail: stamp the "end" time into the second query of the pair.
+    record(tail, [&](const VkCommandBuffer cb) {
+        vtable.CmdWriteTimestamp2KHR(cb, VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT,
+                                     this->query_pool, this->query_index + 1);
+    });
+}
+
+void TimestampPool::poll() {
+    // Snapshot every block's query results so that subsequent get_polled()
+    // calls need no Vulkan interaction at all.
+    this->cached_timestamps.clear();
+    this->cached_timestamps.reserve(std::size(this->blocks));
+
+    std::ranges::transform(
+        this->blocks, std::back_inserter(this->cached_timestamps),
+        [this](const auto& block) {
+            const auto& query_pool = block.query_pool;
+
+            auto timestamps = std::make_unique<std::vector<std::uint64_t>>(
+                this->TIMESTAMP_QUERY_POOL_SIZE);
+
+            // One bulk read per block: 64-bit results, one uint64 stride per
+            // query, and no WAIT bit so this never blocks on the GPU.
+            const auto result = vtable.GetQueryPoolResults(
+                this->device, query_pool, 0, this->TIMESTAMP_QUERY_POOL_SIZE,
+                this->TIMESTAMP_QUERY_POOL_SIZE * sizeof(std::uint64_t),
+                std::data(*timestamps), sizeof(std::uint64_t),
+                VK_QUERY_RESULT_64_BIT);
+
+            // Might return not ready when any of them aren't ready, which is
+            // not an error for our use case.
+            assert(result == VK_SUCCESS || result == VK_NOT_READY);
+            (void)result; // silence unused-variable warnings in NDEBUG builds
+
+            return timestamps;
+        });
+}
+
+std::uint64_t TimestampPool::get_polled(const Handle& handle) {
+    // Return the timestamp captured for `handle` in the most recent poll().
+    // Calling this for a handle whose queries had not completed by that
+    // poll is undefined (the cached value is whatever the driver wrote).
+    assert(handle.block_index < std::size(this->cached_timestamps));
+
+    const auto& cached_timestamp = this->cached_timestamps[handle.block_index];
+    assert(cached_timestamp != nullptr);
+    // BUG FIX: the bounds assert was inverted (size < index), and the
+    // function returned the query *index* instead of the cached timestamp.
+    assert(handle.query_index < std::size(*cached_timestamp));
+
+    return (*cached_timestamp)[handle.query_index];
+}
+
+} // namespace low_latency \ No newline at end of file
diff --git a/src/timestamp_pool.hh b/src/timestamp_pool.hh
new file mode 100644
index 0000000..7efa4ee
--- /dev/null
+++ b/src/timestamp_pool.hh
@@ -0,0 +1,123 @@
+#ifndef TIMESTAMP_POOL_HH_
+#define TIMESTAMP_POOL_HH_
+
+// The purpose of this file is to provide the definition of a 'timestamp pool'.
+// It manages blocks of timestamp query pools, hands them out when requested,
+// and allocates more when (if) we run out. It also efficiently reads them back.
+// This class solves some key issues:
+//
+// 1. We need a potentially infinite amount of timestamps available to the
+// GPU. While I imagine most (good) applications will limit the amount of
+// times they call vkQueueSubmit, there's no bound we can place on the
+// amount of times this function will be called. Also,
+// the amount of frames in flight might vary, so really we need
+// num_queue_submits * max_frames_in_flight timestamps. Obviously, we don't
+// know what these numbers are at runtime and can't assume that they are
+// reasonable or even constant either. We solve this by allocating more
+// timestamps when necessary.
+
+// 2. We don't want to hammer vulkan with expensive timestamp read
+// operations. If we have hundreds of query pools lying around, reading them
+// back will take hundreds of individual vulkan calls. They
+// should be batched so as to perform as few reads as possible. So if we allocate
+// multiple big query pool strips, then reading them will only require that many
+// calls. We then can cache off the result of reading as well so iterating
+// through later doesn't require any vulkan interaction at all.
+//
+//
+// Usage:
+// 1. Get handle with .acquire().
+// 2. Write start/end timestamp operations with the handle's pool and index
+// into the provided command buffer.
+// 3. With the command buffer signalled completion via some semaphore /
+// fence, call .poll(). This will cache off all outstanding handles.
+//      Retrieving with a handle that has not been signalled is undefined.
+// 4. Retrieve timestamp results with .get_polled(your_handle).
+// 5. Destruct the handle to return the key to the pool.
+
+#include <vulkan/utility/vk_dispatch_table.h>
+#include <vulkan/vulkan.hpp>
+
+#include <memory>
+#include <unordered_set>
+
+namespace low_latency {
+
+class TimestampPool final {
+  private:
+    // Number of individual timestamp queries per block. Queries are handed
+    // out in (start, end) pairs, hence the even-size requirement.
+    static constexpr auto TIMESTAMP_QUERY_POOL_SIZE = 512u;
+    static_assert(TIMESTAMP_QUERY_POOL_SIZE % 2 == 0);
+
+  private:
+    VkuDeviceDispatchTable vtable;
+    VkDevice device;
+    VkCommandPool command_pool; // not owned; the QueueContext destroys it
+
+    // VkQueryPool with an unordered set of keys available for reading.
+    using available_query_indicies_t = std::unordered_set<std::uint64_t>;
+
+    struct block {
+        VkQueryPool query_pool;
+        std::shared_ptr<available_query_indicies_t> available_indicies;
+        std::unique_ptr<std::vector<VkCommandBuffer>> command_buffers;
+    };
+    std::vector<block> blocks; // multiple blocks
+
+    // A snapshot of all available blocks for reading after each poll.
+    std::vector<std::unique_ptr<std::vector<std::uint64_t>>> cached_timestamps;
+
+  public:
+    // A handle represents two std::uint64_t blocks of timestamp memory and two
+    // command buffers.
+    struct Handle {
+      private:
+        friend class TimestampPool;
+
+      private:
+        std::weak_ptr<available_query_indicies_t> index_origin;
+        std::size_t block_index;
+
+      public:
+        VkQueryPool query_pool;
+        std::uint64_t query_index;
+        std::array<VkCommandBuffer, 2> command_buffers;
+
+      public:
+        Handle(const std::weak_ptr<TimestampPool::available_query_indicies_t>&
+                   index_origin,
+               const std::size_t block_index, const VkQueryPool& query_pool,
+               const std::uint64_t query_index,
+               const std::array<VkCommandBuffer, 2>& command_buffers);
+        Handle(const Handle& handle) = delete;
+        Handle(Handle&&) = delete;
+        // BUG FIX: these were spelled `operator==`, which deletes equality
+        // comparison rather than assignment.
+        Handle& operator=(const Handle& handle) = delete;
+        Handle& operator=(Handle&&) = delete;
+        ~Handle(); // frees from the pool
+
+      public:
+        // Records the start/end timestamp writes into the two command
+        // buffers for later submission around the tracked work.
+        void setup_command_buffers(const VkuDeviceDispatchTable& vtable) const;
+    };
+
+  private:
+    block allocate();
+
+  public:
+    TimestampPool(const VkDevice& device, const VkuDeviceDispatchTable& vtable,
+                  const VkCommandPool& command_pool);
+    TimestampPool(const TimestampPool&) = delete;
+    TimestampPool(TimestampPool&&) = delete;
+    // BUG FIX: previously spelled `operator==` (see Handle above).
+    TimestampPool& operator=(const TimestampPool&) = delete;
+    TimestampPool& operator=(TimestampPool&&) = delete;
+
+    // BUG FIX: the per-block query pools were leaked. Release them here;
+    // the command buffers are reclaimed when the owning command pool is
+    // destroyed by whoever created it.
+    ~TimestampPool() {
+        for (const auto& b : this->blocks) {
+            vtable.DestroyQueryPool(device, b.query_pool, nullptr);
+        }
+    }
+
+  public:
+    // Hands out a Handle with a pool and index of two uint64_t's.
+    std::unique_ptr<Handle> acquire();
+
+    void poll(); // saves the current state for future get's.
+
+    std::uint64_t get_polled(const Handle& handle);
+};
+
+} // namespace low_latency
+
+#endif \ No newline at end of file