aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt2
-rw-r--r--src/latency_controller.hh15
-rw-r--r--src/layer.cc626
-rw-r--r--src/layer.hh2
-rw-r--r--src/queue_context.cc51
-rw-r--r--src/queue_context.hh44
-rw-r--r--src/timestamp_pool.cc172
-rw-r--r--src/timestamp_pool.hh123
8 files changed, 821 insertions, 214 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 793e637..09daa4c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,6 +36,6 @@ add_custom_command(TARGET ${LIBRARY_NAME} POST_BUILD
add_custom_command(TARGET ${LIBRARY_NAME} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
- "${CMAKE_CURRENT_SOURCE_DIR}/low_latency_layer.json"
+ "${CMAKE_CURRENT_SOURCE_DIR}/low_latency_layer.json"
"${OUTPUT_DIR}/"
)
diff --git a/src/latency_controller.hh b/src/latency_controller.hh
new file mode 100644
index 0000000..6672d5a
--- /dev/null
+++ b/src/latency_controller.hh
@@ -0,0 +1,15 @@
+#ifndef LATENCY_CONTROLLER_HH_
+#define LATENCY_CONTROLLER_HH_
+
+// The purpose of this file is to provide
+
+namespace low_latency {
+
+class LatencyController final {
+
+
+};
+
+};
+
+#endif \ No newline at end of file
diff --git a/src/layer.cc b/src/layer.cc
index 24cc519..94b4969 100644
--- a/src/layer.cc
+++ b/src/layer.cc
@@ -1,5 +1,6 @@
#include "layer.hh"
+#include <utility>
#include <vulkan/utility/vk_dispatch_table.h>
#include <vulkan/vk_layer.h>
#include <vulkan/vk_platform.h>
@@ -7,30 +8,31 @@
#include <vulkan/vulkan.hpp>
#include <vulkan/vulkan_core.h>
-#include <cstring>
+#include <deque>
#include <iostream>
#include <mutex>
#include <string_view>
#include <unordered_map>
+#include <unordered_set>
+
+#include "queue_context.hh"
+#include "timestamp_pool.hh"
namespace low_latency {
+// Global mutex for layer data.
static auto mutex = std::mutex{};
-struct command_stats {
- std::uint32_t num_draws;
- std::uint32_t num_instances;
- std::uint32_t num_verts;
-};
-static std::unordered_map<VkCommandBuffer, command_stats>
- commandbuffer_to_stats{};
-static std::unordered_map<void*, VkuInstanceDispatchTable> instance_dispatch;
-static std::unordered_map<void*, VkuDeviceDispatchTable> device_dispatch;
+// Mappings for device instances.
+static std::unordered_map<VkPhysicalDevice, VkInstance> device_instances;
+static std::unordered_map<void*, VkuInstanceDispatchTable> instance_vtables;
+static std::unordered_map<void*, VkuDeviceDispatchTable> device_vtables;
+
+static std::uint64_t current_frame = 0;
+static std::unordered_map<VkQueue, QueueContext> queue_contexts;
template <typename T>
concept DispatchableType =
- std::same_as<std::remove_cvref_t<T>, VkQueue> ||
- std::same_as<std::remove_cvref_t<T>, VkCommandBuffer> ||
std::same_as<std::remove_cvref_t<T>, VkInstance> ||
std::same_as<std::remove_cvref_t<T>, VkDevice> ||
std::same_as<std::remove_cvref_t<T>, VkPhysicalDevice>;
@@ -38,143 +40,72 @@ template <DispatchableType T> void* get_key(const T& inst) {
return *reinterpret_cast<void**>(inst);
}
-static VKAPI_ATTR VkResult VKAPI_CALL
-BeginCommandBuffer(VkCommandBuffer command_buffer,
- const VkCommandBufferBeginInfo* begin_info) {
- const auto lock = std::scoped_lock{mutex};
- commandbuffer_to_stats[command_buffer] = {};
- return device_dispatch[get_key(command_buffer)].BeginCommandBuffer(
- command_buffer, begin_info);
-}
-
-static VKAPI_ATTR void VKAPI_CALL CmdDraw(VkCommandBuffer command_buffer,
- std::uint32_t vertex_count,
- std::uint32_t instance_count,
- std::uint32_t first_vertex,
- std::uint32_t first_instance) {
-
- const auto lock = std::scoped_lock{mutex};
-
- if (const auto it = commandbuffer_to_stats.find(command_buffer);
- it != std::end(commandbuffer_to_stats)) {
-
- auto& stats = it->second;
- stats.num_draws++;
- stats.num_instances += instance_count;
- stats.num_verts += instance_count * vertex_count;
- }
-
- device_dispatch[get_key(command_buffer)].CmdDraw(
- command_buffer, vertex_count, instance_count, first_vertex,
- first_instance);
-}
-
-static VKAPI_ATTR void VKAPI_CALL CmdDrawIndexed(VkCommandBuffer command_buffer,
- uint32_t index_count,
- uint32_t instance_count,
- uint32_t first_index,
- int32_t vertex_offset,
- uint32_t first_instance) {
-
- const auto lock = std::scoped_lock{mutex};
-
- if (const auto it = commandbuffer_to_stats.find(command_buffer);
- it != std::end(commandbuffer_to_stats)) {
-
- auto& stats = it->second;
- stats.num_draws++;
- stats.num_instances += instance_count;
- stats.num_verts += instance_count * index_count;
- }
-
- device_dispatch[get_key(command_buffer)].CmdDrawIndexed(
- command_buffer, index_count, instance_count, first_index, vertex_offset,
- first_instance);
-}
-
-static VKAPI_ATTR VkResult VKAPI_CALL
-EndCommandBuffer(VkCommandBuffer command_buffer) {
-
- const auto lock = std::scoped_lock{mutex};
+template <typename T, typename sType>
+static T* get_link_info(const void* const head, const sType& stype) {
+ for (auto i = reinterpret_cast<const VkBaseInStructure*>(head); i;
+ i = i->pNext) {
- const auto& s = commandbuffer_to_stats[command_buffer];
+ if (i->sType != stype) {
+ continue;
+ }
- std::cout << std::format("Command buffer ended with {} draws, {} "
- "instances and {} vertices\n",
- s.num_draws, s.num_instances, s.num_verts);
+ const auto info = reinterpret_cast<const T*>(i);
+ if (info->function != VK_LAYER_LINK_INFO) {
+ continue;
+ }
- const auto it = device_dispatch.find(get_key(command_buffer));
- if (it == std::end(device_dispatch)) {
- return VK_ERROR_DEVICE_LOST;
+ return const_cast<T*>(info);
}
- return it->second.EndCommandBuffer(command_buffer);
+ return nullptr;
}
static VKAPI_ATTR VkResult VKAPI_CALL
CreateInstance(const VkInstanceCreateInfo* pCreateInfo,
const VkAllocationCallbacks* pAllocator, VkInstance* pInstance) {
- // Iterate through list starting at pNext until we see create_info and
- // link_info.
- auto layer_create_info = [&]() -> VkLayerInstanceCreateInfo* {
- for (auto base =
- reinterpret_cast<const VkBaseInStructure*>(pCreateInfo->pNext);
- base; base = base->pNext) {
-
- if (base->sType != VK_STRUCTURE_TYPE_LOADER_INSTANCE_CREATE_INFO) {
- continue;
- }
+ const auto link_info = get_link_info<VkLayerInstanceCreateInfo>(
+ pCreateInfo->pNext, VK_STRUCTURE_TYPE_LOADER_INSTANCE_CREATE_INFO);
- const auto info =
- reinterpret_cast<const VkLayerInstanceCreateInfo*>(base);
- if (info->function != VK_LAYER_LINK_INFO) {
- continue;
- }
- return const_cast<VkLayerInstanceCreateInfo*>(info);
- }
- return nullptr;
- }();
-
- if (!layer_create_info || !layer_create_info->u.pLayerInfo) {
+ if (!link_info || !link_info->u.pLayerInfo) {
return VK_ERROR_INITIALIZATION_FAILED;
}
// Store our get instance proc addr function and pop it off our list +
// advance the list so future layers know what to call.
- const auto next_gipa =
- layer_create_info->u.pLayerInfo->pfnNextGetInstanceProcAddr;
- if (!next_gipa) {
+ const auto gipa = link_info->u.pLayerInfo->pfnNextGetInstanceProcAddr;
+ if (!gipa) {
return VK_ERROR_INITIALIZATION_FAILED;
}
- layer_create_info->u.pLayerInfo = layer_create_info->u.pLayerInfo->pNext;
+ link_info->u.pLayerInfo = link_info->u.pLayerInfo->pNext;
// Call our create instance func, and store vkDestroyInstance, and
// vkCreateDevice as well.
- const auto create_instance_func = reinterpret_cast<PFN_vkCreateInstance>(
- next_gipa(VK_NULL_HANDLE, "vkCreateInstance"));
- if (!create_instance_func) {
+ const auto create_instance = reinterpret_cast<PFN_vkCreateInstance>(
+ gipa(VK_NULL_HANDLE, "vkCreateInstance"));
+ if (!create_instance) {
return VK_ERROR_INITIALIZATION_FAILED;
}
- if (const auto result =
- create_instance_func(pCreateInfo, pAllocator, pInstance);
+ if (const auto result = create_instance(pCreateInfo, pAllocator, pInstance);
result != VK_SUCCESS) {
return result;
}
const auto lock = std::scoped_lock{mutex};
- instance_dispatch.emplace(
+ instance_vtables.emplace(
get_key(*pInstance),
VkuInstanceDispatchTable{
.DestroyInstance = reinterpret_cast<PFN_vkDestroyInstance>(
- next_gipa(*pInstance, "vkDestroyInstance")),
+ gipa(*pInstance, "vkDestroyInstance")),
+ .EnumeratePhysicalDevices =
+ reinterpret_cast<PFN_vkEnumeratePhysicalDevices>(
+ gipa(*pInstance, "vkEnumeratePhysicalDevices")),
.GetInstanceProcAddr = reinterpret_cast<PFN_vkGetInstanceProcAddr>(
- next_gipa(*pInstance, "vkGetInstanceProcAddr")),
+ gipa(*pInstance, "vkGetInstanceProcAddr")),
.EnumerateDeviceExtensionProperties =
reinterpret_cast<PFN_vkEnumerateDeviceExtensionProperties>(
- next_gipa(*pInstance,
- "vkEnumerateDeviceExtensionProperties")),
+ gipa(*pInstance, "vkEnumerateDeviceExtensionProperties")),
}
);
@@ -186,75 +117,195 @@ static VKAPI_ATTR void VKAPI_CALL
DestroyInstance(VkInstance instance, const VkAllocationCallbacks* allocator) {
const auto lock = std::scoped_lock{mutex};
- instance_dispatch.erase(get_key(instance));
+
+ const auto key = get_key(instance);
+ assert(instance_vtables.contains(key));
+ instance_vtables.erase(key);
+}
+
+static VKAPI_ATTR VkResult VKAPI_CALL EnumeratePhysicalDevices(
+ VkInstance instance, std::uint32_t* count, VkPhysicalDevice* devices) {
+
+ const auto lock = std::scoped_lock{mutex};
+
+ const auto it = instance_vtables.find(get_key(instance));
+ assert(it != std::end(instance_vtables));
+ const auto& vtable = it->second;
+
+ if (const auto result =
+ vtable.EnumeratePhysicalDevices(instance, count, devices);
+ !devices || result != VK_SUCCESS) {
+
+ return result;
+ }
+
+ for (auto i = std::uint32_t{0}; i < *count; ++i) {
+ device_instances.emplace(devices[i], instance);
+ }
+
+ return VK_SUCCESS;
}
static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
VkPhysicalDevice physical_device, const VkDeviceCreateInfo* pCreateInfo,
const VkAllocationCallbacks* pAllocator, VkDevice* pDevice) {
- auto layer_create_info = [&]() -> VkLayerDeviceCreateInfo* {
- for (auto base =
- reinterpret_cast<const VkBaseInStructure*>(pCreateInfo->pNext);
- base; base = base->pNext) {
+ const auto create_info = get_link_info<VkLayerDeviceCreateInfo>(
+ pCreateInfo->pNext, VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO);
+ if (!create_info || !create_info->u.pLayerInfo) {
+ return VK_ERROR_INITIALIZATION_FAILED;
+ }
+
+ const auto gipa = create_info->u.pLayerInfo->pfnNextGetInstanceProcAddr;
+ const auto gdpa = create_info->u.pLayerInfo->pfnNextGetDeviceProcAddr;
+ if (!gipa || !gdpa) {
+ return VK_ERROR_INITIALIZATION_FAILED;
+ }
+ create_info->u.pLayerInfo = create_info->u.pLayerInfo->pNext;
+
+ const auto lock = std::scoped_lock{mutex};
+
+ const auto next_extensions =
+ [&]() -> std::optional<std::vector<const char*>> {
+ const auto supported_extensions =
+ [&]() -> std::optional<std::vector<VkExtensionProperties>> {
+ const auto enumerate_device_extensions =
+ reinterpret_cast<PFN_vkEnumerateDeviceExtensionProperties>(
+ gipa(device_instances[physical_device],
+ "vkEnumerateDeviceExtensionProperties"));
+ if (!enumerate_device_extensions) {
+ return std::nullopt;
+ }
+
+ auto count = std::uint32_t{};
+ if (enumerate_device_extensions(physical_device, nullptr, &count,
+ nullptr) != VK_SUCCESS) {
+
+ return std::nullopt;
+ }
+
+ auto supported_extensions =
+ std::vector<VkExtensionProperties>(count);
+ if (enumerate_device_extensions(physical_device, nullptr, &count,
+ std::data(supported_extensions)) !=
+ VK_SUCCESS) {
+
+ return std::nullopt;
+ }
+
+ return supported_extensions;
+ }();
+
+ auto next_extensions =
+ std::vector{*pCreateInfo->ppEnabledExtensionNames,
+ std::next(*pCreateInfo->ppEnabledExtensionNames +
+ pCreateInfo->enabledExtensionCount)};
+
+ const auto wanted_extensions = {
+ VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME,
+ VK_KHR_CALIBRATED_TIMESTAMPS_EXTENSION_NAME,
+ VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME};
+
+ for (const auto& wanted : wanted_extensions) {
+
+ if (std::ranges::any_of(
+ next_extensions, [&](const auto& next_extension) {
+ return !std::strcmp(next_extension, wanted);
+ })) {
- if (base->sType != VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO) {
- continue;
+ continue; // Already included, ignore it.
}
- const auto info =
- reinterpret_cast<const VkLayerDeviceCreateInfo*>(base);
+ if (std::ranges::none_of(*supported_extensions,
+ [&](const auto& supported_extension) {
+ return !std::strcmp(
+ supported_extension.extensionName,
+ wanted);
+ })) {
- if (info->function != VK_LAYER_LINK_INFO) {
- continue;
+ return std::nullopt; // We don't support it, the layer can't
+ // work.
}
- return const_cast<VkLayerDeviceCreateInfo*>(info);
+ next_extensions.push_back(wanted);
}
- return nullptr;
+
+ return next_extensions;
}();
- if (!layer_create_info || !layer_create_info->u.pLayerInfo) {
+ if (!next_extensions.has_value()) {
return VK_ERROR_INITIALIZATION_FAILED;
}
- const auto next_gipa =
- layer_create_info->u.pLayerInfo->pfnNextGetInstanceProcAddr;
- const auto next_gdpa =
- layer_create_info->u.pLayerInfo->pfnNextGetDeviceProcAddr;
- if (!next_gipa || !next_gdpa) {
+ const auto create_device = reinterpret_cast<PFN_vkCreateDevice>(
+ gipa(VK_NULL_HANDLE, "vkCreateDevice"));
+ if (!create_device) {
return VK_ERROR_INITIALIZATION_FAILED;
}
- layer_create_info->u.pLayerInfo = layer_create_info->u.pLayerInfo->pNext;
- const auto create_func = reinterpret_cast<PFN_vkCreateDevice>(
- next_gipa(VK_NULL_HANDLE, "vkCreateDevice"));
- if (!create_func) {
- return VK_ERROR_INITIALIZATION_FAILED;
- }
+ const auto next_create_info = [&]() -> VkDeviceCreateInfo {
+ auto next_pCreateInfo = *pCreateInfo;
+ next_pCreateInfo.ppEnabledExtensionNames = std::data(*next_extensions);
+ next_pCreateInfo.enabledExtensionCount = std::size(*next_extensions);
+ return next_pCreateInfo;
+ }();
- if (const auto result =
- create_func(physical_device, pCreateInfo, pAllocator, pDevice);
+ if (const auto result = create_device(physical_device, &next_create_info,
+ pAllocator, pDevice);
result != VK_SUCCESS) {
+
return result;
}
- const auto lock = std::scoped_lock{mutex};
- device_dispatch.emplace(
+ device_vtables.emplace(
get_key(*pDevice),
VkuDeviceDispatchTable{
.GetDeviceProcAddr = reinterpret_cast<PFN_vkGetDeviceProcAddr>(
- next_gdpa(*pDevice, "vkGetDeviceProcAddr")),
+ gdpa(*pDevice, "vkGetDeviceProcAddr")),
.DestroyDevice = reinterpret_cast<PFN_vkDestroyDevice>(
- next_gdpa(*pDevice, "vkDestroyDevice")),
+ gdpa(*pDevice, "vkDestroyDevice")),
+ .GetDeviceQueue = reinterpret_cast<PFN_vkGetDeviceQueue>(
+ gdpa(*pDevice, "vkGetDeviceQueue")),
+ .QueueSubmit = reinterpret_cast<PFN_vkQueueSubmit>(
+ gdpa(*pDevice, "vkQueueSubmit")),
+ .CreateSemaphore = reinterpret_cast<PFN_vkCreateSemaphore>(
+ gdpa(*pDevice, "vkCreateSemaphore")),
+ .CreateQueryPool = reinterpret_cast<PFN_vkCreateQueryPool>(
+ gdpa(*pDevice, "vkCreateQueryPool")),
+ .GetQueryPoolResults = reinterpret_cast<PFN_vkGetQueryPoolResults>(
+ gdpa(*pDevice, "vkGetQueryPoolResults")),
+ .CreateCommandPool = reinterpret_cast<PFN_vkCreateCommandPool>(
+ gdpa(*pDevice, "vkCreateCommandPool")),
+ .AllocateCommandBuffers =
+ reinterpret_cast<PFN_vkAllocateCommandBuffers>(
+ gdpa(*pDevice, "vkAllocateCommandBuffers")),
.BeginCommandBuffer = reinterpret_cast<PFN_vkBeginCommandBuffer>(
- next_gdpa(*pDevice, "vkBeginCommandBuffer")),
+ gdpa(*pDevice, "vkBeginCommandBuffer")),
.EndCommandBuffer = reinterpret_cast<PFN_vkEndCommandBuffer>(
- next_gdpa(*pDevice, "vkEndCommandBuffer")),
- .CmdDraw = reinterpret_cast<PFN_vkCmdDraw>(
- next_gdpa(*pDevice, "vkCmdDraw")),
+ gdpa(*pDevice, "vkEndCommandBuffer")),
+ .ResetCommandBuffer = reinterpret_cast<PFN_vkResetCommandBuffer>(
+ gdpa(*pDevice, "vkResetCommandBuffer")),
+ .CmdDraw =
+ reinterpret_cast<PFN_vkCmdDraw>(gdpa(*pDevice, "vkCmdDraw")),
.CmdDrawIndexed = reinterpret_cast<PFN_vkCmdDrawIndexed>(
- next_gdpa(*pDevice, "vkCmdDrawIndexed")),
+ gdpa(*pDevice, "vkCmdDrawIndexed")),
+ .CmdResetQueryPool = reinterpret_cast<PFN_vkCmdResetQueryPool>(
+ gdpa(*pDevice, "vkCmdResetQueryPool")),
+ .GetDeviceQueue2 = reinterpret_cast<PFN_vkGetDeviceQueue2>(
+ gdpa(*pDevice, "vkGetDeviceQueue2")),
+ .QueueSubmit2 = reinterpret_cast<PFN_vkQueueSubmit2>(
+ gdpa(*pDevice, "vkQueueSubmit2")),
+ .QueuePresentKHR = reinterpret_cast<PFN_vkQueuePresentKHR>(
+ gdpa(*pDevice, "vkQueuePresentKHR")),
+ .GetSemaphoreCounterValueKHR =
+ reinterpret_cast<PFN_vkGetSemaphoreCounterValueKHR>(
+ gdpa(*pDevice, "vkGetSemaphoreCounterValueKHR")),
+ .CmdWriteTimestamp2KHR =
+ reinterpret_cast<PFN_vkCmdWriteTimestamp2KHR>(
+ gdpa(*pDevice, "vkCmdWriteTimestamp2KHR")),
+ .QueueSubmit2KHR = reinterpret_cast<PFN_vkQueueSubmit2KHR>(
+ gdpa(*pDevice, "vkQueueSubmit2KHR")),
+
});
return VK_SUCCESS;
@@ -264,68 +315,225 @@ static VKAPI_ATTR void VKAPI_CALL
DestroyDevice(VkDevice device, const VkAllocationCallbacks* allocator) {
const auto lock = std::scoped_lock{mutex};
- device_dispatch.erase(get_key(device));
+ const auto key = get_key(device);
+ assert(device_vtables.contains(key));
+ device_vtables.erase(key);
}
-// These are wrong, the tutorial isn't correct afaik.
-static VKAPI_ATTR VkResult VKAPI_CALL EnumerateInstanceLayerProperties(
- std::uint32_t* pPropertyCount, VkLayerProperties* pProperties) {
+// Small amount of duplication, we can't assume gdq2 is available apparently.
+static VKAPI_ATTR void VKAPI_CALL
+GetDeviceQueue(VkDevice device, std::uint32_t queue_family_index,
+ std::uint32_t queue_index, VkQueue* queue) {
+
+ const auto lock = std::scoped_lock{mutex};
+ const auto& vtable = device_vtables[get_key(device)];
+
+ vtable.GetDeviceQueue(device, queue_family_index, queue_index, queue);
+ if (!queue || !*queue) {
+ return;
+ }
- if (pPropertyCount) {
- *pPropertyCount = 1;
+ if (!queue_contexts.contains(*queue)) {
+ queue_contexts.emplace(
+ std::piecewise_construct, std::forward_as_tuple(*queue),
+ std::forward_as_tuple(device, *queue, queue_family_index, vtable));
}
+}
+
+static VKAPI_ATTR void VKAPI_CALL GetDeviceQueue2(
+ VkDevice device, const VkDeviceQueueInfo2* info, VkQueue* queue) {
+
+ const auto lock = std::scoped_lock{mutex};
+ const auto& vtable = device_vtables[get_key(device)];
- if (pProperties) {
- std::strcpy(pProperties->layerName, LAYER_NAME);
- std::strcpy(pProperties->description, "Low Latency Layer");
- pProperties->implementationVersion = 1;
- pProperties->specVersion = VK_API_VERSION_1_3;
+ vtable.GetDeviceQueue2(device, info, queue);
+ if (!queue || !*queue) {
+ return;
}
- return VK_SUCCESS;
+ if (!queue_contexts.contains(*queue)) {
+ queue_contexts.emplace(
+ std::piecewise_construct, std::forward_as_tuple(*queue),
+ std::forward_as_tuple(device, *queue, info->queueFamilyIndex,
+ vtable));
+ }
}
-static VKAPI_ATTR VkResult VKAPI_CALL EnumerateDeviceLayerProperties(
- VkPhysicalDevice physical_device, uint32_t* pPropertyCount,
- VkLayerProperties* pProperties) {
+static VKAPI_ATTR VkResult VKAPI_CALL
+vkQueueSubmit(VkQueue queue, std::uint32_t submit_count,
+ const VkSubmitInfo* submit_info, VkFence fence) {
+
+ const auto lock = std::scoped_lock{mutex};
+
+ auto& queue_context = [&]() -> auto& {
+ const auto& queue_context_it = queue_contexts.find(queue);
+ assert(queue_context_it != std::end(queue_contexts));
+ return queue_context_it->second;
+ }();
+ const auto& vtable = device_vtables[get_key(queue_context.device)];
+
+ if (!submit_count) { // no-op submit we shouldn't worry about
+ return vtable.QueueSubmit(queue, submit_count, submit_info, fence);
+ }
+
+ // Create a new vector of submit infos, copy their existing ones.
+ auto next_submit_infos = std::vector<VkSubmitInfo>{};
+ next_submit_infos.reserve(submit_count + 2);
+
+ auto timestamp_handle = queue_context.timestamp_pool.acquire();
+ timestamp_handle->setup_command_buffers(vtable);
+
+ const auto& [head_cb, tail_cb] = timestamp_handle->command_buffers;
+
+ // The first submit info we use will steal their wait semaphores.
+ next_submit_infos.push_back(VkSubmitInfo{
+ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+ .pNext = submit_info->pNext,
+ .waitSemaphoreCount = submit_info[0].waitSemaphoreCount,
+ .pWaitSemaphores = submit_info[0].pWaitSemaphores,
+ .pWaitDstStageMask = submit_info[0].pWaitDstStageMask,
+ .commandBufferCount = 1,
+ .pCommandBuffers = &head_cb,
+ });
+
+ // Fill in original submit infos but erase the wait semaphores on the
+ // first because we stole them earlier.
+ std::ranges::copy_n(submit_info, submit_count,
+ std::back_inserter(next_submit_infos));
+ next_submit_infos[1].pWaitSemaphores = nullptr;
+ next_submit_infos[1].waitSemaphoreCount = 0u;
+
+ const auto TODO_next = std::uint64_t{current_frame + 1};
+ const auto tail_tssi = VkTimelineSemaphoreSubmitInfo{
+ .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR,
+ .signalSemaphoreValueCount = 1,
+ .pSignalSemaphoreValues = &TODO_next,
+ };
+ next_submit_infos.push_back(VkSubmitInfo{
+ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+ .pNext = &tail_tssi,
+ .commandBufferCount = 1,
+ .pCommandBuffers = &tail_cb,
+ .signalSemaphoreCount = 1,
+ .pSignalSemaphores = &queue_context.semaphore,
+ });
+
+ if (const auto res =
+ vtable.QueueSubmit(queue, std::size(next_submit_infos),
+ std::data(next_submit_infos), fence);
+ res != VK_SUCCESS) {
+
+ return res;
+ }
- return EnumerateInstanceLayerProperties(pPropertyCount, pProperties);
+ return VK_SUCCESS;
}
-static VKAPI_ATTR VkResult VKAPI_CALL EnumerateInstanceExtensionProperties(
- const char* pLayerName, uint32_t* pPropertyCount,
- VkExtensionProperties* pProperties) {
+// The logic for this function is identical to vkSubmitInfo.
+static VKAPI_ATTR VkResult VKAPI_CALL
+vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count,
+ const VkSubmitInfo2* submit_infos, VkFence fence) {
- if (!pLayerName || std::string_view{pLayerName} != LAYER_NAME) {
+ const auto lock = std::scoped_lock{mutex};
+ auto& queue_context = [&]() -> auto& {
+ const auto& queue_context_it = queue_contexts.find(queue);
+ assert(queue_context_it != std::end(queue_contexts));
+ return queue_context_it->second;
+ }();
+ const auto& vtable = device_vtables[get_key(queue_context.device)];
- return VK_ERROR_LAYER_NOT_PRESENT;
+ if (!submit_count) { // another no-op submit
+ return vtable.QueueSubmit2(queue, submit_count, submit_infos, fence);
}
- if (pPropertyCount) {
- *pPropertyCount = 0;
+ auto next_submit_infos = std::vector<VkSubmitInfo2>();
+ next_submit_infos.reserve(submit_count + 2);
+
+ auto timestamp_handle = queue_context.timestamp_pool.acquire();
+ timestamp_handle->setup_command_buffers(vtable);
+ const auto& [head_cb, tail_cb] = timestamp_handle->command_buffers;
+
+ const auto head_cb_info = VkCommandBufferSubmitInfo{
+ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
+ .commandBuffer = head_cb,
+ };
+ next_submit_infos.push_back(VkSubmitInfo2{
+ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
+ .waitSemaphoreInfoCount = submit_infos[0].waitSemaphoreInfoCount,
+ .pWaitSemaphoreInfos = submit_infos[0].pWaitSemaphoreInfos,
+ .commandBufferInfoCount = 1,
+ .pCommandBufferInfos = &head_cb_info,
+ });
+ std::ranges::copy_n(submit_infos, submit_count,
+ std::back_inserter(next_submit_infos));
+ next_submit_infos[1].pWaitSemaphoreInfos = nullptr;
+ next_submit_infos[1].waitSemaphoreInfoCount = 0;
+
+ const auto tail_cb_info = VkCommandBufferSubmitInfo{
+ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
+ .commandBuffer = tail_cb,
+ };
+ next_submit_infos.push_back(VkSubmitInfo2{
+ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
+ .waitSemaphoreInfoCount = submit_infos[0].waitSemaphoreInfoCount,
+ .pWaitSemaphoreInfos = submit_infos[0].pWaitSemaphoreInfos,
+ .commandBufferInfoCount = 1,
+ .pCommandBufferInfos = &tail_cb_info,
+ });
+
+ if (const auto res =
+ vtable.QueueSubmit2(queue, submit_count, submit_infos, fence);
+ res != VK_SUCCESS) {
+ return res;
}
+
return VK_SUCCESS;
}
-static VKAPI_ATTR VkResult VKAPI_CALL EnumerateDeviceExtensionProperties(
- VkPhysicalDevice physical_device, const char* pLayerName,
- uint32_t* pPropertyCount, VkExtensionProperties* pProperties) {
+static VKAPI_ATTR VkResult VKAPI_CALL
+vkQueueSubmit2KHR(VkQueue queue, std::uint32_t submit_count,
+ const VkSubmitInfo2* submit_info, VkFence fence) {
+ // Just forward to low_latency::vkQueueSubmit2 here.
+ return low_latency::vkQueueSubmit2(queue, submit_count, submit_info, fence);
+}
- if (!pLayerName || std::string_view{pLayerName} != LAYER_NAME) {
+static VKAPI_ATTR VkResult VKAPI_CALL
+vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* present_info) {
- if (physical_device == VK_NULL_HANDLE) {
- return VK_SUCCESS;
- }
+ const auto lock = std::scoped_lock{mutex};
+ auto& queue_context = [&]() -> auto& {
+ const auto& queue_context_it = queue_contexts.find(queue);
+ assert(queue_context_it != std::end(queue_contexts));
+ return queue_context_it->second;
+ }();
+ const auto& vtable = device_vtables[get_key(queue_context.device)];
- const auto lock = std::scoped_lock{mutex};
- return instance_dispatch[get_key(physical_device)]
- .EnumerateDeviceExtensionProperties(physical_device, pLayerName,
- pPropertyCount, pProperties);
+ if (const auto res = vtable.QueuePresentKHR(queue, present_info);
+ res != VK_SUCCESS) {
+
+ return res;
}
- if (pPropertyCount) {
- *pPropertyCount = 0;
+ std::cout << "queuePresentKHR called for queue " << queue << '\n';
+
+ // Update all of our information about this queue's timestamp pool!
+ queue_context.timestamp_pool.poll();
+
+ // While we might be submitting on this queue, let's see what our timeline
+ // semaphore says we're at.
+ uint64_t value = 0;
+ if (const auto res = vtable.GetSemaphoreCounterValueKHR(
+ queue_context.device, queue_context.semaphore, &value);
+ res != VK_SUCCESS) {
+
+ return res;
}
+
+ std::cout << " frame_index: " << current_frame << '\n';
+ std::cout << " semaphore: " << value << '\n';
+ std::cout << " queue: " << queue << '\n';
+
+ ++current_frame;
return VK_SUCCESS;
}
@@ -336,17 +544,14 @@ static const auto instance_functions =
{"vkGetInstanceProcAddr",
reinterpret_cast<PFN_vkVoidFunction>(LowLatency_GetInstanceProcAddr)},
- {"vkEnumerateInstanceLayerProperties",
- reinterpret_cast<PFN_vkVoidFunction>(
- low_latency::EnumerateInstanceLayerProperties)},
- {"vkEnumerateInstanceExtensionProperties",
- reinterpret_cast<PFN_vkVoidFunction>(
- low_latency::EnumerateInstanceExtensionProperties)},
-
{"vkCreateInstance",
reinterpret_cast<PFN_vkVoidFunction>(low_latency::CreateInstance)},
{"vkDestroyInstance",
reinterpret_cast<PFN_vkVoidFunction>(low_latency::DestroyInstance)},
+
+ {"vkEnumeratePhysicalDevices",
+ reinterpret_cast<PFN_vkVoidFunction>(
+ low_latency::EnumeratePhysicalDevices)},
};
static const auto device_functions =
@@ -354,27 +559,23 @@ static const auto device_functions =
{"vkGetDeviceProcAddr",
reinterpret_cast<PFN_vkVoidFunction>(LowLatency_GetDeviceProcAddr)},
- {"vkEnumerateDeviceLayerProperties",
- reinterpret_cast<PFN_vkVoidFunction>(
- low_latency::EnumerateDeviceLayerProperties)},
- {"vkEnumerateDeviceExtensionProperties",
- reinterpret_cast<PFN_vkVoidFunction>(
- low_latency::EnumerateDeviceExtensionProperties)},
-
{"vkCreateDevice",
reinterpret_cast<PFN_vkVoidFunction>(low_latency::CreateDevice)},
{"vkDestroyDevice",
reinterpret_cast<PFN_vkVoidFunction>(low_latency::DestroyDevice)},
- {"vkCmdDraw",
- reinterpret_cast<PFN_vkVoidFunction>(low_latency::CmdDraw)},
- {"vkCmdDrawIndexed",
- reinterpret_cast<PFN_vkVoidFunction>(low_latency::CmdDrawIndexed)},
+ {"vkGetDeviceQueue",
+ reinterpret_cast<PFN_vkVoidFunction>(low_latency::GetDeviceQueue)},
+ {"vkGetDeviceQueue2",
+ reinterpret_cast<PFN_vkVoidFunction>(low_latency::GetDeviceQueue2)},
- {"vkBeginCommandBuffer",
- reinterpret_cast<PFN_vkVoidFunction>(low_latency::BeginCommandBuffer)},
- {"vkEndCommandBuffer",
- reinterpret_cast<PFN_vkVoidFunction>(low_latency::EndCommandBuffer)},
+ {"vkQueueSubmit",
+ reinterpret_cast<PFN_vkVoidFunction>(low_latency::vkQueueSubmit)},
+ {"vkQueueSubmit2",
+ reinterpret_cast<PFN_vkVoidFunction>(low_latency::vkQueueSubmit2)},
+
+ {"vkQueuePresentKHR",
+ reinterpret_cast<PFN_vkVoidFunction>(low_latency::vkQueuePresentKHR)},
};
VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL
@@ -387,7 +588,7 @@ LowLatency_GetDeviceProcAddr(VkDevice device, const char* const pName) {
}
const auto lock = std::scoped_lock{low_latency::mutex};
- return low_latency::device_dispatch[low_latency::get_key(device)]
+ return low_latency::device_vtables[low_latency::get_key(device)]
.GetDeviceProcAddr(device, pName);
}
@@ -395,14 +596,13 @@ VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL
LowLatency_GetInstanceProcAddr(VkInstance instance, const char* const pName) {
for (const auto& functions : {device_functions, instance_functions}) {
- const auto it = functions.find(pName);
- if (it == std::end(functions)) {
- continue;
+
+ if (const auto it = functions.find(pName); it != std::end(functions)) {
+ return it->second;
}
- return it->second;
}
const auto lock = std::scoped_lock{low_latency::mutex};
- return low_latency::instance_dispatch[low_latency::get_key(instance)]
+ return low_latency::instance_vtables[low_latency::get_key(instance)]
.GetInstanceProcAddr(instance, pName);
} \ No newline at end of file
diff --git a/src/layer.hh b/src/layer.hh
index 5633c63..08152f2 100644
--- a/src/layer.hh
+++ b/src/layer.hh
@@ -4,6 +4,8 @@
#include <vulkan/vk_platform.h>
#include <vulkan/vulkan.hpp>
+// The purpose of this file is to expose a header entry point for our layer.
+
extern "C" {
VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL
diff --git a/src/queue_context.cc b/src/queue_context.cc
new file mode 100644
index 0000000..dbae4c0
--- /dev/null
+++ b/src/queue_context.cc
@@ -0,0 +1,51 @@
+#include "queue_context.hh"
+
+namespace low_latency {
+
+static VkCommandPool make_command_pool(const VkDevice& device,
+ const std::uint32_t& queue_family_index,
+ const VkuDeviceDispatchTable& vtable) {
+
+ const auto cpci = VkCommandPoolCreateInfo{
+ .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+ .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT |
+ VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
+ .queueFamilyIndex = queue_family_index,
+ };
+
+ auto command_pool = VkCommandPool{};
+ vtable.CreateCommandPool(device, &cpci, nullptr, &command_pool);
+ return command_pool;
+}
+
+QueueContext::QueueContext(const VkDevice& device, const VkQueue queue,
+ const std::uint32_t& queue_family_index,
+ const VkuDeviceDispatchTable& vtable)
+ : device(device), queue(queue), queue_family_index(queue_family_index),
+ vtable(vtable),
+ // Important we make the command pool before the timestamp pool, because it's a dependency.
+ command_pool(make_command_pool(device, queue_family_index, vtable)),
+ timestamp_pool(device, vtable, command_pool) {
+
+ this->semaphore = [&]() -> VkSemaphore {
+ const auto stci = VkSemaphoreTypeCreateInfo{
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
+ .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE,
+ .initialValue = 0,
+ };
+
+ const auto sci = VkSemaphoreCreateInfo{
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
+ .pNext = &stci,
+ };
+
+ auto semaphore = VkSemaphore{};
+ vtable.CreateSemaphore(device, &sci, nullptr, &semaphore);
+ return semaphore;
+ }();
+}
+
+QueueContext::~QueueContext() {
+}
+
+} // namespace low_latency \ No newline at end of file
diff --git a/src/queue_context.hh b/src/queue_context.hh
new file mode 100644
index 0000000..eb3f2ea
--- /dev/null
+++ b/src/queue_context.hh
@@ -0,0 +1,44 @@
+#ifndef QUEUE_STATE_HH_
+#define QUEUE_STATE_HH_
+
+#include "timestamp_pool.hh"
+
+#include <vulkan/utility/vk_dispatch_table.h>
+#include <vulkan/vulkan.hpp>
+
+#include <deque>
+#include <vector>
+
+namespace low_latency {
+
+class QueueContext final {
+ public:
+ VkDevice device;
+ VkuDeviceDispatchTable vtable;
+
+ VkQueue queue;
+ std::uint32_t queue_family_index;
+
+ VkSemaphore semaphore;
+ VkCommandPool command_pool;
+
+ TimestampPool timestamp_pool;
+
+ std::deque<
+ std::vector<std::pair<TimestampPool::Handle, TimestampPool::Handle>>>
+ tracked_queues;
+
+ public:
+ QueueContext(const VkDevice& device, const VkQueue queue,
+ const std::uint32_t& queue_family_index,
+ const VkuDeviceDispatchTable& vtable);
+ QueueContext(const QueueContext&) = delete;
+ QueueContext(QueueContext&&) = delete;
+ QueueContext operator==(const QueueContext&) = delete;
+ QueueContext operator==(QueueContext&&) = delete;
+ ~QueueContext();
+};
+
+}; // namespace low_latency
+
+#endif \ No newline at end of file
diff --git a/src/timestamp_pool.cc b/src/timestamp_pool.cc
new file mode 100644
index 0000000..1dc37b2
--- /dev/null
+++ b/src/timestamp_pool.cc
@@ -0,0 +1,172 @@
+#include "timestamp_pool.hh"
+
+#include <vulkan/vulkan_core.h>
+
+#include <array>
+#include <cassert>
+#include <ranges>
+
+namespace low_latency {
+
+TimestampPool::block TimestampPool::allocate() {
+    // Create one strip of TIMESTAMP_QUERY_POOL_SIZE timestamp queries.
+    const auto query_pool = [&]() -> VkQueryPool {
+        const auto qpci = VkQueryPoolCreateInfo{
+            .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
+            .queryType = VK_QUERY_TYPE_TIMESTAMP,
+            .queryCount = this->TIMESTAMP_QUERY_POOL_SIZE};
+
+        auto query_pool = VkQueryPool{};
+        vtable.CreateQueryPool(device, &qpci, nullptr, &query_pool);
+        return query_pool;
+    }();
+
+    // Queries are handed out in (start, end) pairs, so only the even
+    // indices 0, 2, 4, ... are ever available for acquisition.
+    const auto key_range =
+        std::views::iota(0u, this->TIMESTAMP_QUERY_POOL_SIZE / 2) |
+        std::views::transform([](const std::uint64_t& i) { return 2 * i; });
+
+    // Construct the set in place instead of building a temporary and
+    // copying it into the shared_ptr.
+    auto available_keys = std::make_shared<available_query_indicies_t>(
+        std::begin(key_range), std::end(key_range));
+
+    // One command buffer per query: buffer 2k records the start timestamp
+    // and buffer 2k+1 records the end timestamp for pair k.
+    auto command_buffers = [this]() -> auto {
+        auto command_buffers =
+            std::vector<VkCommandBuffer>(this->TIMESTAMP_QUERY_POOL_SIZE);
+
+        const auto cbai = VkCommandBufferAllocateInfo{
+            .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+            .commandPool = this->command_pool,
+            .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+            .commandBufferCount =
+                static_cast<std::uint32_t>(std::size(command_buffers)),
+        };
+        vtable.AllocateCommandBuffers(device, &cbai,
+                                      std::data(command_buffers));
+        // PERF FIX: move the vector into the unique_ptr instead of copying.
+        return std::make_unique<std::vector<VkCommandBuffer>>(
+            std::move(command_buffers));
+    }();
+
+    return block{.query_pool = query_pool,
+                 .available_indicies = std::move(available_keys),
+                 .command_buffers = std::move(command_buffers)};
+}
+
+// Build the pool with a single pre-allocated block; for most applications
+// one block is more than enough and no further allocation ever happens.
+TimestampPool::TimestampPool(const VkDevice& device,
+                             const VkuDeviceDispatchTable& vtable,
+                             const VkCommandPool& command_pool)
+    : device(device), vtable(vtable), command_pool(command_pool) {
+    this->blocks.push_back(this->allocate());
+}
+
+std::unique_ptr<TimestampPool::Handle> TimestampPool::acquire() {
+    // Find a block that still has a free (start, end) query pair; if every
+    // block is exhausted, grow the pool by one freshly allocated block.
+    auto vacant_iter =
+        std::ranges::find_if(this->blocks, [](const auto& block) {
+            return !block.available_indicies->empty();
+        });
+    if (vacant_iter == std::end(this->blocks)) {
+        this->blocks.emplace_back(this->allocate());
+        vacant_iter = std::prev(std::end(this->blocks));
+    }
+
+    const auto query_pool = vacant_iter->query_pool;
+    auto& available_indices = vacant_iter->available_indicies;
+
+    // Pop an arbitrary free index out of the set.
+    const auto first_free = std::begin(*available_indices);
+    const auto query_index = *first_free;
+    available_indices->erase(first_free);
+
+    // The two command buffers for this query pair live at the same offset
+    // in the block's command buffer strip.
+    auto command_buffers = std::array<VkCommandBuffer, 2>{};
+    std::ranges::copy_n(
+        std::next(std::begin(*vacant_iter->command_buffers), query_index),
+        std::size(command_buffers), std::begin(command_buffers));
+
+    const auto block_index = static_cast<std::size_t>(
+        std::distance(std::begin(this->blocks), vacant_iter));
+
+    return std::make_unique<Handle>(available_indices, block_index, query_pool,
+                                    query_index, command_buffers);
+}
+
+// Handle: a ticket for one (start, end) timestamp query pair plus its two
+// command buffers. `index_origin` is a weak reference to the owning block's
+// free-index set, so the destructor can return the index to the pool (or
+// skip that safely if the pool has already been destroyed).
+TimestampPool::Handle::Handle(
+    const std::weak_ptr<TimestampPool::available_query_indicies_t>&
+        index_origin,
+    const std::size_t block_index, const VkQueryPool& query_pool,
+    const std::uint64_t query_index,
+    const std::array<VkCommandBuffer, 2>& command_buffers)
+    : index_origin(index_origin), block_index(block_index),
+      query_pool(query_pool), query_index(query_index),
+      command_buffers(command_buffers) {}
+
+TimestampPool::Handle::~Handle() {
+    // Return our query index to the owning block's free set — unless the
+    // pool itself is already gone, in which case the weak_ptr won't lock
+    // and there is nothing left to return the index to.
+    const auto origin = this->index_origin.lock();
+    if (origin != nullptr) {
+        assert(origin->count(this->query_index) == 0);
+        origin->insert(this->query_index);
+    }
+}
+
+void TimestampPool::Handle::setup_command_buffers(
+    const VkuDeviceDispatchTable& vtable) const {
+
+    const auto begin_info = VkCommandBufferBeginInfo{
+        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
+    };
+
+    // Re-record one command buffer: reset, begin, caller-supplied commands,
+    // end — the shared boilerplate for both the head and tail buffers.
+    const auto record = [&](const VkCommandBuffer cb, const auto& commands) {
+        vtable.ResetCommandBuffer(cb, 0);
+        vtable.BeginCommandBuffer(cb, &begin_info);
+        commands(cb);
+        vtable.EndCommandBuffer(cb);
+    };
+
+    const auto& [head, tail] = this->command_buffers;
+
+    // Head: reset the two queries for this handle (making them unavailable
+    // once this runs), then stamp the "start" time.
+    record(head, [&](const VkCommandBuffer cb) {
+        vtable.CmdResetQueryPool(cb, this->query_pool, this->query_index, 2);
+        vtable.CmdWriteTimestamp2KHR(cb, VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
+                                     this->query_pool, this->query_index);
+    });
+
+    // Tail: stamp the "end" time into the second query of the pair.
+    record(tail, [&](const VkCommandBuffer cb) {
+        vtable.CmdWriteTimestamp2KHR(cb, VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT,
+                                     this->query_pool, this->query_index + 1);
+    });
+}
+
+void TimestampPool::poll() {
+    // Snapshot every block's query results so that subsequent get_polled()
+    // calls need no Vulkan interaction at all.
+    this->cached_timestamps.clear();
+    this->cached_timestamps.reserve(std::size(this->blocks));
+
+    std::ranges::transform(
+        this->blocks, std::back_inserter(this->cached_timestamps),
+        [this](const auto& block) {
+            const auto& query_pool = block.query_pool;
+
+            auto timestamps = std::make_unique<std::vector<std::uint64_t>>(
+                this->TIMESTAMP_QUERY_POOL_SIZE);
+
+            // One bulk read per block: 64-bit results, one uint64 stride per
+            // query, and no WAIT bit so this never blocks on the GPU.
+            const auto result = vtable.GetQueryPoolResults(
+                this->device, query_pool, 0, this->TIMESTAMP_QUERY_POOL_SIZE,
+                this->TIMESTAMP_QUERY_POOL_SIZE * sizeof(std::uint64_t),
+                std::data(*timestamps), sizeof(std::uint64_t),
+                VK_QUERY_RESULT_64_BIT);
+
+            // Might return not ready when any of them aren't ready, which is
+            // not an error for our use case.
+            assert(result == VK_SUCCESS || result == VK_NOT_READY);
+            (void)result; // silence unused-variable warnings in NDEBUG builds
+
+            return timestamps;
+        });
+}
+
+std::uint64_t TimestampPool::get_polled(const Handle& handle) {
+    // Return the timestamp captured for `handle` in the most recent poll().
+    // Calling this for a handle whose queries had not completed by that
+    // poll is undefined (the cached value is whatever the driver wrote).
+    assert(handle.block_index < std::size(this->cached_timestamps));
+
+    const auto& cached_timestamp = this->cached_timestamps[handle.block_index];
+    assert(cached_timestamp != nullptr);
+    // BUG FIX: the bounds assert was inverted (size < index), and the
+    // function returned the query *index* instead of the cached timestamp.
+    assert(handle.query_index < std::size(*cached_timestamp));
+
+    return (*cached_timestamp)[handle.query_index];
+}
+
+} // namespace low_latency \ No newline at end of file
diff --git a/src/timestamp_pool.hh b/src/timestamp_pool.hh
new file mode 100644
index 0000000..7efa4ee
--- /dev/null
+++ b/src/timestamp_pool.hh
@@ -0,0 +1,123 @@
+#ifndef TIMESTAMP_POOL_HH_
+#define TIMESTAMP_POOL_HH_
+
+// The purpose of this file is to provide the definition of a 'timestamp pool'.
+// It manages blocks of timestamp query pools, hands them out when requested,
+// and allocates more when (if) we run out. It also efficiently reads them back.
+// This class solves some key issues:
+//
+// 1. We need a potentially infinite amount of timestamps available to the
+// GPU. While I imagine most (good) applications will limit the amount of
+// times they call vkQueueSubmit, there's no bound we can place on the
+// amount of times this function will be called. Also,
+// the amount of frames in flight might vary, so really we need
+// num_queue_submits * max_frames_in_flight timestamps. Obviously, we don't
+// know what these numbers are at runtime and can't assume that they are
+// reasonable or even constant either. We solve this by allocating more
+// timestamps when necessary.
+
+// 2. We don't want to hammer vulkan with expensive timestamp read
+// operations. If we have hundreds of query pools lying around, reading them
+// back will take hundreds of individual vulkan calls. They
+// should be batched so as to perform as few reads as possible. So if we allocate
+// multiple big query pool strips, then reading them will only require that many
+// calls. We then can cache off the result of reading as well so iterating
+// through later doesn't require any vulkan interaction at all.
+//
+//
+// Usage:
+// 1. Get handle with .acquire().
+// 2. Write start/end timestamp operations with the handle's pool and index
+// into the provided command buffer.
+// 3. With the command buffer signalled completion via some semaphore /
+// fence, call .poll(). This will cache off all outstanding handles.
+//      Retrieving with a handle that has not been signalled is undefined.
+// 4. Retrieve timestamp results with .get_polled(your_handle).
+// 5. Destruct the handle to return the key to the pool.
+
+#include <vulkan/utility/vk_dispatch_table.h>
+#include <vulkan/vulkan.hpp>
+
+#include <memory>
+#include <unordered_set>
+
+namespace low_latency {
+
+class TimestampPool final {
+  private:
+    // Number of individual timestamp queries per block. Queries are handed
+    // out in (start, end) pairs, hence the even-size requirement.
+    static constexpr auto TIMESTAMP_QUERY_POOL_SIZE = 512u;
+    static_assert(TIMESTAMP_QUERY_POOL_SIZE % 2 == 0);
+
+  private:
+    VkuDeviceDispatchTable vtable;
+    VkDevice device;
+    VkCommandPool command_pool; // not owned; the QueueContext destroys it
+
+    // VkQueryPool with an unordered set of keys available for reading.
+    using available_query_indicies_t = std::unordered_set<std::uint64_t>;
+
+    struct block {
+        VkQueryPool query_pool;
+        std::shared_ptr<available_query_indicies_t> available_indicies;
+        std::unique_ptr<std::vector<VkCommandBuffer>> command_buffers;
+    };
+    std::vector<block> blocks; // multiple blocks
+
+    // A snapshot of all available blocks for reading after each poll.
+    std::vector<std::unique_ptr<std::vector<std::uint64_t>>> cached_timestamps;
+
+  public:
+    // A handle represents two std::uint64_t blocks of timestamp memory and two
+    // command buffers.
+    struct Handle {
+      private:
+        friend class TimestampPool;
+
+      private:
+        std::weak_ptr<available_query_indicies_t> index_origin;
+        std::size_t block_index;
+
+      public:
+        VkQueryPool query_pool;
+        std::uint64_t query_index;
+        std::array<VkCommandBuffer, 2> command_buffers;
+
+      public:
+        Handle(const std::weak_ptr<TimestampPool::available_query_indicies_t>&
+                   index_origin,
+               const std::size_t block_index, const VkQueryPool& query_pool,
+               const std::uint64_t query_index,
+               const std::array<VkCommandBuffer, 2>& command_buffers);
+        Handle(const Handle& handle) = delete;
+        Handle(Handle&&) = delete;
+        // BUG FIX: these were spelled `operator==`, which deletes equality
+        // comparison rather than assignment.
+        Handle& operator=(const Handle& handle) = delete;
+        Handle& operator=(Handle&&) = delete;
+        ~Handle(); // frees from the pool
+
+      public:
+        // Records the start/end timestamp writes into the two command
+        // buffers for later submission around the tracked work.
+        void setup_command_buffers(const VkuDeviceDispatchTable& vtable) const;
+    };
+
+  private:
+    block allocate();
+
+  public:
+    TimestampPool(const VkDevice& device, const VkuDeviceDispatchTable& vtable,
+                  const VkCommandPool& command_pool);
+    TimestampPool(const TimestampPool&) = delete;
+    TimestampPool(TimestampPool&&) = delete;
+    // BUG FIX: previously spelled `operator==` (see Handle above).
+    TimestampPool& operator=(const TimestampPool&) = delete;
+    TimestampPool& operator=(TimestampPool&&) = delete;
+
+    // BUG FIX: the per-block query pools were leaked. Release them here;
+    // the command buffers are reclaimed when the owning command pool is
+    // destroyed by whoever created it.
+    ~TimestampPool() {
+        for (const auto& b : this->blocks) {
+            vtable.DestroyQueryPool(device, b.query_pool, nullptr);
+        }
+    }
+
+  public:
+    // Hands out a Handle with a pool and index of two uint64_t's.
+    std::unique_ptr<Handle> acquire();
+
+    void poll(); // saves the current state for future get's.
+
+    std::uint64_t get_polled(const Handle& handle);
+};
+
+} // namespace low_latency
+
+#endif \ No newline at end of file