#include "layer.hh" #include #include #include #include #include #include #include #include #include #include #include #include #include "device_context.hh" #include "instance_context.hh" #include "layer_context.hh" #include "queue_context.hh" #include "timestamp_pool.hh" namespace low_latency { namespace { LayerContext layer_context; } // namespace // Small templates which allow us to SFINAE find pNext structs. template static T* find_next(void* const head, const VkStructureType& stype) { for (auto i = reinterpret_cast(head)->pNext; i; i = i->pNext) { if (i->sType == stype) { return reinterpret_cast(i); } } return nullptr; } template static const T* find_next(const void* const head, const VkStructureType& stype) { for (auto i = reinterpret_cast(head)->pNext; i; i = i->pNext) { if (i->sType == stype) { return reinterpret_cast(i); } } return nullptr; } template static const T* find_link(const void* const head, const VkStructureType& stype) { for (auto info = find_next(head, stype); info; info = find_next(info, stype)) { if (info->function == VK_LAYER_LINK_INFO) { return reinterpret_cast(info); } } return nullptr; } static VKAPI_ATTR VkResult VKAPI_CALL CreateInstance(const VkInstanceCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkInstance* pInstance) { const auto link_info = find_link( pCreateInfo, VK_STRUCTURE_TYPE_LOADER_INSTANCE_CREATE_INFO); if (!link_info || !link_info->u.pLayerInfo) { return VK_ERROR_INITIALIZATION_FAILED; } // Store our get instance proc addr function and pop it off our list + // advance the list so future layers know what to call. const auto gipa = link_info->u.pLayerInfo->pfnNextGetInstanceProcAddr; if (!gipa) { return VK_ERROR_INITIALIZATION_FAILED; } const_cast(link_info)->u.pLayerInfo = link_info->u.pLayerInfo->pNext; // Call our create instance func, and store vkDestroyInstance, and // vkCreateDevice as well. const auto create_instance = reinterpret_cast( gipa(VK_NULL_HANDLE, "vkCreateInstance")); if (!create_instance) { return VK_ERROR_INITIALIZATION_FAILED; } if (const auto result = create_instance(pCreateInfo, pAllocator, pInstance); result != VK_SUCCESS) { return result; } const auto key = layer_context.get_key(*pInstance); #define INSTANCE_VTABLE_LOAD(name) \ vtable.name = reinterpret_cast(gipa(*pInstance, "vk" #name)) auto vtable = VkuInstanceDispatchTable{}; INSTANCE_VTABLE_LOAD(DestroyInstance); INSTANCE_VTABLE_LOAD(EnumeratePhysicalDevices); INSTANCE_VTABLE_LOAD(GetPhysicalDeviceProperties); INSTANCE_VTABLE_LOAD(GetInstanceProcAddr); INSTANCE_VTABLE_LOAD(CreateDevice); INSTANCE_VTABLE_LOAD(EnumerateDeviceExtensionProperties); INSTANCE_VTABLE_LOAD(GetPhysicalDeviceQueueFamilyProperties2); INSTANCE_VTABLE_LOAD(GetPhysicalDeviceFeatures2); #undef INSTANCE_VTABLE_LOAD const auto lock = std::scoped_lock{layer_context.mutex}; assert(!layer_context.contexts.contains(key)); layer_context.contexts.try_emplace( key, std::make_shared(layer_context, *pInstance, std::move(vtable))); return VK_SUCCESS; } static VKAPI_ATTR void VKAPI_CALL DestroyInstance(VkInstance instance, const VkAllocationCallbacks* allocator) { const auto destroy_instance_func = [&]() -> auto { const auto context = layer_context.get_context(instance); const auto lock = std::scoped_lock{layer_context.mutex}; // Erase our physical devices owned by this instance from the global // context. for (const auto& [key, _] : context->phys_devices) { assert(layer_context.contexts.erase(key)); } const auto key = layer_context.get_key(instance); assert(layer_context.contexts.erase(key)); // Should be the last ptr now like DestroyDevice. assert(context.unique()); return context->vtable.DestroyInstance; }(); destroy_instance_func(instance, allocator); } static VKAPI_ATTR VkResult VKAPI_CALL EnumeratePhysicalDevices( VkInstance instance, std::uint32_t* count, VkPhysicalDevice* devices) { const auto context = layer_context.get_context(instance); if (const auto result = context->vtable.EnumeratePhysicalDevices(instance, count, devices); !devices || !count || result != VK_SUCCESS) { return result; } const auto lock = std::scoped_lock{layer_context.mutex}; for (const auto& device : std::span{devices, *count}) { const auto key = layer_context.get_key(device); const auto [iter, inserted] = layer_context.contexts.try_emplace(key, nullptr); if (inserted) { iter->second = std::make_shared(*context, device); } } return VK_SUCCESS; } static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice( VkPhysicalDevice physical_device, const VkDeviceCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDevice* pDevice) { const auto enabled_extensions = std::span{pCreateInfo->ppEnabledExtensionNames, pCreateInfo->enabledExtensionCount}; const auto requested = std::unordered_set( std::from_range, enabled_extensions); // There's the antilag extension that might be requested here - Antilag2. // Then there's the other thing we provide, which is our AntiLag1 // equivalent. Calling them AL1 and AL2, where AL1 is requested via // an env var and AL2 is requested at the device level via the extension, // the cases where we exit with a bad code or deliberately no-op are: // // !SUPPORTED && !AL2 && !AL1 -> No-op hooks // !SUPPORTED && !AL2 && AL1 -> No-op hooks // !SUPPORTED && AL2 && !AL1 -> VK_ERROR_INITIALIZATION_FAILED // !SUPPORTED && AL2 && AL1 -> VK_ERROR_INITIALIZATION_FAILED // SUPPORTED && !AL2 && !AL1 -> No-op hooks. // // Note that even though the user has explicitly enabled AL1 via an env var, // failing hard here by returning INIT_FAILED if the device doesn't support // it is wrong. The vulkan application could just be creating a device that // cannot support it which is unrelated to anything present related. This // is not the case with AL2, because the vulkan application has to // explicitly ask for the extension when it creates the device. const auto was_antilag_requested = requested.contains(VK_AMD_ANTI_LAG_EXTENSION_NAME); const auto context = layer_context.get_context(physical_device); if (!context->supports_required_extensions && was_antilag_requested) { return VK_ERROR_INITIALIZATION_FAILED; } const auto create_info = find_link( pCreateInfo, VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO); if (!create_info || !create_info->u.pLayerInfo) { return VK_ERROR_INITIALIZATION_FAILED; } const auto gdpa = create_info->u.pLayerInfo->pfnNextGetDeviceProcAddr; if (!gdpa) { return VK_ERROR_INITIALIZATION_FAILED; } const_cast(create_info)->u.pLayerInfo = create_info->u.pLayerInfo->pNext; // Build a next extensions vector from what they have requested. const auto next_extensions = [&]() -> std::vector { auto next_extensions = std::vector(std::from_range, enabled_extensions); // Don't append anything extra if we don't support what we need. if (!context->supports_required_extensions) { return next_extensions; } // Only append the extra extension if it wasn't already asked for. for (const auto& wanted : PhysicalDeviceContext::required_extensions) { if (requested.contains(wanted)) { continue; } next_extensions.push_back(wanted); } return next_extensions; }(); const auto next_create_info = [&]() -> VkDeviceCreateInfo { auto next_pCreateInfo = *pCreateInfo; next_pCreateInfo.ppEnabledExtensionNames = std::data(next_extensions); next_pCreateInfo.enabledExtensionCount = static_cast(std::size(next_extensions)); return next_pCreateInfo; }(); if (const auto result = context->instance.vtable.CreateDevice( physical_device, &next_create_info, pAllocator, pDevice); result != VK_SUCCESS) { return result; } #define DEVICE_VTABLE_LOAD(name) \ vtable.name = reinterpret_cast(gdpa(*pDevice, "vk" #name)) auto vtable = VkuDeviceDispatchTable{}; DEVICE_VTABLE_LOAD(GetDeviceProcAddr); DEVICE_VTABLE_LOAD(DestroyDevice); DEVICE_VTABLE_LOAD(GetDeviceQueue); DEVICE_VTABLE_LOAD(QueueSubmit); DEVICE_VTABLE_LOAD(CreateQueryPool); DEVICE_VTABLE_LOAD(DestroyQueryPool); DEVICE_VTABLE_LOAD(GetQueryPoolResults); DEVICE_VTABLE_LOAD(CreateCommandPool); DEVICE_VTABLE_LOAD(DestroyCommandPool); DEVICE_VTABLE_LOAD(AllocateCommandBuffers); DEVICE_VTABLE_LOAD(FreeCommandBuffers); DEVICE_VTABLE_LOAD(BeginCommandBuffer); DEVICE_VTABLE_LOAD(EndCommandBuffer); DEVICE_VTABLE_LOAD(ResetCommandBuffer); DEVICE_VTABLE_LOAD(CmdResetQueryPool); DEVICE_VTABLE_LOAD(GetDeviceQueue2); DEVICE_VTABLE_LOAD(QueueSubmit2); DEVICE_VTABLE_LOAD(AcquireNextImageKHR); DEVICE_VTABLE_LOAD(QueuePresentKHR); DEVICE_VTABLE_LOAD(AcquireNextImage2KHR); DEVICE_VTABLE_LOAD(CmdWriteTimestamp2KHR); DEVICE_VTABLE_LOAD(QueueSubmit2KHR); DEVICE_VTABLE_LOAD(GetCalibratedTimestampsKHR); DEVICE_VTABLE_LOAD(ResetQueryPoolEXT); #undef DEVICE_VTABLE_LOAD const auto key = layer_context.get_key(*pDevice); const auto lock = std::scoped_lock{layer_context.mutex}; assert(!layer_context.contexts.contains(key)); layer_context.contexts.try_emplace( key, std::make_shared(context->instance, *context, *pDevice, was_antilag_requested, std::move(vtable))); return VK_SUCCESS; } static VKAPI_ATTR void VKAPI_CALL DestroyDevice(VkDevice device, const VkAllocationCallbacks* allocator) { const auto destroy_device_func = [&]() -> auto { const auto device_context = layer_context.get_context(device); const auto func = device_context->vtable.DestroyDevice; const auto lock = std::scoped_lock{layer_context.mutex}; // Remove all owned queues from our global context pool. for (const auto& [queue, _] : device_context->queues) { const auto key = layer_context.get_key(queue); assert(layer_context.contexts.erase(key)); } const auto key = layer_context.get_key(device); assert(layer_context.contexts.erase(key)); // should be the last shared ptr now, so its destructor can be called. // the destructor should expect its owned queues to be unique as well! assert(device_context.unique()); return func; }(); destroy_device_func(device, allocator); } static VKAPI_ATTR void VKAPI_CALL GetDeviceQueue(VkDevice device, std::uint32_t queue_family_index, std::uint32_t queue_index, VkQueue* queue) { const auto context = layer_context.get_context(device); // Get device queue, unlike CreateDevice or CreateInstance, can be // called multiple times to return the same queue object. Our insertion // handling has to be a little different where we account for this. context->vtable.GetDeviceQueue(device, queue_family_index, queue_index, queue); if (!queue || !*queue) { return; } // Look in our layer context, which has everything. If we were able to // insert a nullptr key, then it didn't already exist so we should // construct a new one. const auto key = layer_context.get_key(*queue); const auto lock = std::scoped_lock{layer_context.mutex}; const auto [it, inserted] = layer_context.contexts.try_emplace(key); if (inserted) { it->second = std::make_shared(*context, *queue, queue_family_index); } // it->second should be QueueContext, also it might already be there. const auto ptr = std::dynamic_pointer_cast(it->second); assert(ptr); context->queues.emplace(*queue, ptr); } // Identical logic to gdq1. static VKAPI_ATTR void VKAPI_CALL GetDeviceQueue2( VkDevice device, const VkDeviceQueueInfo2* info, VkQueue* queue) { const auto context = layer_context.get_context(device); context->vtable.GetDeviceQueue2(device, info, queue); if (!queue || !*queue) { return; } const auto key = layer_context.get_key(*queue); const auto lock = std::scoped_lock{layer_context.mutex}; const auto [it, inserted] = layer_context.contexts.try_emplace(key); if (inserted) { it->second = std::make_shared(*context, *queue, info->queueFamilyIndex); } const auto ptr = std::dynamic_pointer_cast(it->second); assert(ptr); context->queues.emplace(*queue, ptr); } static VKAPI_ATTR VkResult VKAPI_CALL vkQueueSubmit(VkQueue queue, std::uint32_t submit_count, const VkSubmitInfo* submit_infos, VkFence fence) { const auto context = layer_context.get_context(queue); const auto& vtable = context->device_context.vtable; if (!submit_count || !context->should_inject_timestamps()) { return vtable.QueueSubmit(queue, submit_count, submit_infos, fence); } // What's happening here? // We are making a very modest modification to all vkQueueSubmits where we // inject a start and end timestamp query command buffer that writes when // the GPU started and finished work for each submission. Note, we do *NOT* // use or modify any semaphores as a mechanism to signal completion or the // availability of these submits for multiple reasons: // 1. Modifying semaphores (particuarly in vkQueueSubmit1) is ANNOYING // done correctly. The pNext chain is const and difficult to modify // without traversing the entire thing and doing surgical deep copies // and patches for multiple pNext's sType's. It's easier to leave it // alone. If we do edit them it's either a maintenance nightmare or // an illegal const cast timebomb that breaks valid vulkan // applications that pass truly read only vkSubmitInfo->pNext's. // 2. Semaphores only signal at the end of their work, so we cannot use // them as a mechanism to know if work has started without doing // another dummy submission. If we did this it adds complexity and // also might skew our timestamps slightly as they wouldn't be a part // of the submission which contained those command buffers. // 3. Timestamps support querying if their work has started/ended // as long as we use the vkHostQueryReset extension to reset them // before we consider them queryable. This means we don't need a // 'is it valid to query my timestamps' timeline semaphore. // 4. The performance impact of using semaphores vs timestamps is // negligible. using cbs_t = std::vector; auto next_submits = std::vector{}; // We're making modifications to multiple vkQueueSubmits. These have raw // pointers to our command buffer arrays - of which the position in memory // of can change on vector reallocation. So we use unique_ptrs here. auto next_cbs = std::vector>{}; // notify_submit() should take copies of these shared_ptrs and store // them for the duration of our call, but saving them here is a bit // more explicit + insurance if that changes. auto handles = std::vector>{}; const auto now = DeviceContext::Clock::now(); std::ranges::transform( std::span{submit_infos, submit_count}, std::back_inserter(next_submits), [&](const auto& submit) { const auto head_handle = context->timestamp_pool->acquire(); const auto tail_handle = context->timestamp_pool->acquire(); head_handle->setup_command_buffers(*tail_handle, *context); context->notify_submit(submit, head_handle, tail_handle, now); handles.emplace_back(head_handle); handles.emplace_back(tail_handle); next_cbs.emplace_back([&]() -> auto { auto cbs = std::make_unique(); cbs->push_back(head_handle->command_buffer); std::ranges::copy(std::span{submit.pCommandBuffers, submit.commandBufferCount}, std::back_inserter(*cbs)); cbs->push_back(tail_handle->command_buffer); return cbs; }()); auto next_submit = submit; next_submit.pCommandBuffers = std::data(*next_cbs.back()); next_submit.commandBufferCount = static_cast(std::size(*next_cbs.back())); return next_submit; }); return vtable.QueueSubmit( queue, static_cast(std::size(next_submits)), std::data(next_submits), fence); } // The logic for this function is identical to vkSubmitInfo. static VKAPI_ATTR VkResult VKAPI_CALL vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count, const VkSubmitInfo2* submit_infos, VkFence fence) { const auto context = layer_context.get_context(queue); const auto& vtable = context->device_context.vtable; if (!submit_count || !context->should_inject_timestamps()) { return vtable.QueueSubmit2(queue, submit_count, submit_infos, fence); } using cbs_t = std::vector; auto next_submits = std::vector{}; auto next_cbs = std::vector>{}; auto handles = std::vector>{}; const auto now = DeviceContext::Clock::now(); std::ranges::transform( std::span{submit_infos, submit_count}, std::back_inserter(next_submits), [&](const auto& submit) { const auto head_handle = context->timestamp_pool->acquire(); const auto tail_handle = context->timestamp_pool->acquire(); head_handle->setup_command_buffers(*tail_handle, *context); context->notify_submit(submit, head_handle, tail_handle, now); handles.emplace_back(head_handle); handles.emplace_back(tail_handle); next_cbs.emplace_back([&]() -> auto { auto cbs = std::make_unique(); cbs->push_back(VkCommandBufferSubmitInfo{ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, .commandBuffer = head_handle->command_buffer, }); std::ranges::copy(std::span{submit.pCommandBufferInfos, submit.commandBufferInfoCount}, std::back_inserter(*cbs)); cbs->push_back(VkCommandBufferSubmitInfo{ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO, .commandBuffer = tail_handle->command_buffer, }); return cbs; }()); auto next_submit = submit; next_submit.pCommandBufferInfos = std::data(*next_cbs.back()); next_submit.commandBufferInfoCount = static_cast(std::size(*next_cbs.back())); return next_submit; }); return vtable.QueueSubmit2( queue, static_cast(std::size(next_submits)), std::data(next_submits), fence); } static VKAPI_ATTR VkResult VKAPI_CALL vkQueueSubmit2KHR(VkQueue queue, std::uint32_t submit_count, const VkSubmitInfo2* submit_info, VkFence fence) { // Just forward to low_latency::vkQueueSubmit2 here. return low_latency::vkQueueSubmit2(queue, submit_count, submit_info, fence); } static VKAPI_ATTR VkResult VKAPI_CALL vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* present_info) { const auto context = layer_context.get_context(queue); const auto& vtable = context->device_context.vtable; if (const auto res = vtable.QueuePresentKHR(queue, present_info); res != VK_SUCCESS) { return res; } context->notify_present(*present_info); return VK_SUCCESS; } static VKAPI_ATTR VkResult VKAPI_CALL EnumerateDeviceExtensionProperties( VkPhysicalDevice physical_device, const char* pLayerName, std::uint32_t* pPropertyCount, VkExtensionProperties* pProperties) { const auto context = layer_context.get_context(physical_device); const auto& vtable = context->instance.vtable; // Not asking about our layer - just forward it. if (!pLayerName || std::string_view{pLayerName} != LAYER_NAME) { return vtable.EnumerateDeviceExtensionProperties( physical_device, pLayerName, pPropertyCount, pProperties); } auto& count = *pPropertyCount; // !pProperties means they're querying how much space they need. if (!pProperties) { count = 1; return VK_SUCCESS; } if (!count) { return VK_INCOMPLETE; // They gave us zero space to work with. } pProperties[0] = VkExtensionProperties{.extensionName = VK_AMD_ANTI_LAG_EXTENSION_NAME, .specVersion = VK_AMD_ANTI_LAG_SPEC_VERSION}; count = 1; return VK_SUCCESS; } static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2( VkPhysicalDevice physical_device, VkPhysicalDeviceFeatures2* pFeatures) { const auto context = layer_context.get_context(physical_device); const auto& vtable = context->instance.vtable; vtable.GetPhysicalDeviceFeatures2(physical_device, pFeatures); const auto feature = find_next( pFeatures, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ANTI_LAG_FEATURES_AMD); if (feature) { feature->antiLag = context->supports_required_extensions; } } static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2KHR( VkPhysicalDevice physical_device, VkPhysicalDeviceFeatures2KHR* pFeatures) { return low_latency::GetPhysicalDeviceFeatures2(physical_device, pFeatures); } static VKAPI_ATTR void VKAPI_CALL AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD* pData) { const auto context = layer_context.get_context(device); assert(pData); context->notify_antilag_update(*pData); } } // namespace low_latency // This is a bit of template hackery which generates a wrapper function for each // of our hooks that keeps exceptions from getting sucked back into the caller. // This is useful because it allows us to use exceptions and not violate the // vulkan contract. If we can return something, VK_ERROR_UNKNOWN is used - // otherwise we call terminate. template struct HookExceptionWrapper; template struct HookExceptionWrapper { static R call(Args... args) noexcept { // If the function is void, we don't need to think about anything and // can just call it. if constexpr (std::is_void_v) { Func(args...); return; } // If the function isn't void, we need to wrap it in a try block. Any // exceptions we get (which came from our layer) should handled here // by returning VK_ERROR_UNKNOWN when we can. Otherwise just terminate. try { return Func(args...); } catch (...) { if constexpr (std::is_same_v) { return VK_ERROR_UNKNOWN; } } std::terminate(); } }; #define HOOK_ENTRY(vk_name_literal, fn_sym) \ {vk_name_literal, reinterpret_cast( \ &HookExceptionWrapper::call)} using func_map_t = std::unordered_map; static const auto instance_functions = func_map_t{ HOOK_ENTRY("vkCreateDevice", low_latency::CreateDevice), HOOK_ENTRY("vkGetInstanceProcAddr", LowLatency_GetInstanceProcAddr), HOOK_ENTRY("vkGetDeviceProcAddr", LowLatency_GetDeviceProcAddr), HOOK_ENTRY("vkEnumeratePhysicalDevices", low_latency::EnumeratePhysicalDevices), HOOK_ENTRY("vkCreateInstance", low_latency::CreateInstance), HOOK_ENTRY("vkDestroyInstance", low_latency::DestroyInstance), HOOK_ENTRY("vkEnumerateDeviceExtensionProperties", low_latency::EnumerateDeviceExtensionProperties), HOOK_ENTRY("vkGetPhysicalDeviceFeatures2", low_latency::GetPhysicalDeviceFeatures2), HOOK_ENTRY("vkGetPhysicalDeviceFeatures2KHR", low_latency::GetPhysicalDeviceFeatures2KHR), }; static const auto device_functions = func_map_t{ HOOK_ENTRY("vkGetDeviceProcAddr", LowLatency_GetDeviceProcAddr), HOOK_ENTRY("vkDestroyDevice", low_latency::DestroyDevice), HOOK_ENTRY("vkGetDeviceQueue", low_latency::GetDeviceQueue), HOOK_ENTRY("vkGetDeviceQueue2", low_latency::GetDeviceQueue2), HOOK_ENTRY("vkQueueSubmit", low_latency::vkQueueSubmit), HOOK_ENTRY("vkQueueSubmit2", low_latency::vkQueueSubmit2), HOOK_ENTRY("vkQueueSubmit2KHR", low_latency::vkQueueSubmit2KHR), HOOK_ENTRY("vkQueuePresentKHR", low_latency::vkQueuePresentKHR), HOOK_ENTRY("vkAntiLagUpdateAMD", low_latency::AntiLagUpdateAMD), }; #undef HOOK_ENTRY VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL LowLatency_GetDeviceProcAddr(VkDevice device, const char* const pName) { if (!pName || !device) { return nullptr; } if (const auto it = device_functions.find(pName); it != std::end(device_functions)) { return it->second; } const auto context = low_latency::layer_context.get_context(device); return context->vtable.GetDeviceProcAddr(device, pName); } VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL LowLatency_GetInstanceProcAddr(VkInstance instance, const char* const pName) { if (const auto it = instance_functions.find(pName); it != std::end(instance_functions)) { return it->second; } const auto context = low_latency::layer_context.get_context(instance); return context->vtable.GetInstanceProcAddr(instance, pName); }