aboutsummaryrefslogtreecommitdiff
path: root/src/layer.cc
diff options
context:
space:
mode:
authorNicolas James <Eele1Ephe7uZahRie@tutanota.com>2026-03-30 15:54:10 +1100
committerNicolas James <Eele1Ephe7uZahRie@tutanota.com>2026-03-30 15:54:10 +1100
commit644bc4ed5edd4e3ffa88750bdacb147c75df9546 (patch)
tree5580f05b202493ccb68d5c67ec37e10d53f3cd1d /src/layer.cc
parent681bd5096ee416b50dd7338de30af7b3db385a36 (diff)
Fix AL2 via VK_NV_low_latency2's async implementation, fix race in TimestampPool
Diffstat (limited to 'src/layer.cc')
-rw-r--r--src/layer.cc132
1 file changed, 47 insertions, 85 deletions
diff --git a/src/layer.cc b/src/layer.cc
index 7a7ffc8..813c267 100644
--- a/src/layer.cc
+++ b/src/layer.cc
@@ -78,8 +78,6 @@ CreateInstance(const VkInstanceCreateInfo* pCreateInfo,
INSTANCE_VTABLE_LOAD(GetPhysicalDeviceQueueFamilyProperties2);
INSTANCE_VTABLE_LOAD(GetPhysicalDeviceFeatures2);
INSTANCE_VTABLE_LOAD(GetPhysicalDeviceSurfaceCapabilities2KHR);
- INSTANCE_VTABLE_LOAD(GetPhysicalDeviceProperties2);
- INSTANCE_VTABLE_LOAD(GetPhysicalDeviceProperties2KHR);
#undef INSTANCE_VTABLE_LOAD
const auto lock = std::scoped_lock{layer_context.mutex};
@@ -102,11 +100,13 @@ DestroyInstance(VkInstance instance, const VkAllocationCallbacks* allocator) {
// Erase our physical devices owned by this instance from the global
// context.
for (const auto& [key, _] : context->phys_devices) {
- assert(layer_context.contexts.erase(key));
+ assert(layer_context.contexts.contains(key));
+ layer_context.contexts.erase(key);
}
const auto key = layer_context.get_key(instance);
- assert(layer_context.contexts.erase(key));
+ assert(layer_context.contexts.contains(key));
+ layer_context.contexts.erase(key);
// Should be the last ptr now like DestroyDevice.
assert(context.unique());
@@ -154,23 +154,6 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
const auto requested = std::unordered_set<std::string_view>(
std::from_range, enabled_extensions);
- // There's the antilag extension that might be requested here - Antilag2.
- // Then there's the other thing we provide, which is our AntiLag1
- // equivalent. Calling them AL1 and AL2, where AL1 is requested via
- // an env var and AL2 is requested at the device level via the extension,
- // the cases where we exit with a bad code or deliberately no-op are:
- //
- // !SUPPORTED && !AL2 && AL1 -> No-op hooks
- // !AL2 && !AL1 -> No-op hooks.
- // !SUPPORTED && AL2 -> VK_ERROR_INITIALIZATION_FAILED
- //
- // Note that even though the user has explicitly enabled AL1 via an env var,
- // failing hard here by returning INIT_FAILED if the device doesn't support
- // it is wrong. The vulkan application could just be creating a device that
- // cannot support it which is unrelated to anything present related. This
- // is not the case with AL2, because the vulkan application has to
- // explicitly ask for the extension when it creates the device.
-
const auto was_capability_requested =
requested.contains(VK_AMD_ANTI_LAG_EXTENSION_NAME) ||
requested.contains(VK_NV_LOW_LATENCY_2_EXTENSION_NAME);
@@ -204,11 +187,9 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
// Only append the extra extension if it wasn't already asked for.
for (const auto& wanted : PhysicalDeviceContext::required_extensions) {
- if (requested.contains(wanted)) {
- continue;
+ if (!requested.contains(wanted)) {
+ next_extensions.push_back(wanted);
}
-
- next_extensions.push_back(wanted);
}
return next_extensions;
@@ -284,14 +265,16 @@ DestroyDevice(VkDevice device, const VkAllocationCallbacks* allocator) {
// Remove all owned queues from our global context pool.
for (const auto& [queue, _] : device_context->queues) {
const auto key = layer_context.get_key(queue);
- assert(layer_context.contexts.erase(key));
+ assert(layer_context.contexts.contains(key));
+ layer_context.contexts.erase(key);
}
const auto key = layer_context.get_key(device);
- assert(layer_context.contexts.erase(key));
+ assert(layer_context.contexts.contains(key));
+ layer_context.contexts.erase(key);
- // should be the last shared ptr now, so its destructor can be called.
- // the destructor should expect its owned queues to be unique as well!
+ // Should be the last shared ptr now, so its destructor can be called.
+ // The destructor should expect its owned queues to be unique as well.
assert(device_context.unique());
return func;
@@ -361,7 +344,7 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count,
const VkSubmitInfo* submit_infos, VkFence fence) {
const auto context = layer_context.get_context(queue);
- const auto& vtable = context->device_context.vtable;
+ const auto& vtable = context->device.vtable;
if (!submit_count || !context->should_inject_timestamps()) {
return vtable.QueueSubmit(queue, submit_count, submit_infos, fence);
@@ -447,7 +430,7 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count,
const VkSubmitInfo2* submit_infos, VkFence fence) {
const auto context = layer_context.get_context(queue);
- const auto& vtable = context->device_context.vtable;
+ const auto& vtable = context->device.vtable;
if (!submit_count || !context->should_inject_timestamps()) {
return vtable.QueueSubmit2(queue, submit_count, submit_infos, fence);
@@ -511,7 +494,7 @@ static VKAPI_ATTR VkResult VKAPI_CALL
vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* present_info) {
const auto context = layer_context.get_context(queue);
- const auto& vtable = context->device_context.vtable;
+ const auto& vtable = context->device.vtable;
if (const auto res = vtable.QueuePresentKHR(queue, present_info);
res != VK_SUCCESS) {
@@ -524,7 +507,11 @@ vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* present_info) {
for (auto i = std::uint32_t{0}; i < present_info->swapchainCount; ++i) {
const auto& swapchain = present_info->pSwapchains[i];
+
+ // For VK_AMD_anti_lag, providing a pPresentId isn't part of the spec.
+ // So we just set it to 0 if it isn't provided.
const auto present_id = pid ? pid->pPresentIds[i] : 0;
+
context->notify_present(swapchain, present_id);
}
@@ -549,9 +536,9 @@ static VKAPI_ATTR VkResult VKAPI_CALL EnumerateDeviceExtensionProperties(
physical_device, pLayerName, pPropertyCount, pProperties);
}
- // If we're spoofing nvidia we want to provide their extension instead.
+ // If we're exposing reflex we want to provide their extension instead.
const auto extension_properties = [&]() -> VkExtensionProperties {
- if (context->instance.layer.should_spoof_nvidia) {
+ if (context->instance.layer.should_expose_reflex) {
return {.extensionName = VK_NV_LOW_LATENCY_2_EXTENSION_NAME,
.specVersion = VK_NV_LOW_LATENCY_2_SPEC_VERSION};
}
@@ -561,13 +548,12 @@ static VKAPI_ATTR VkResult VKAPI_CALL EnumerateDeviceExtensionProperties(
if (pLayerName) {
// This query is for our layer specifically.
-
- if (!pProperties) { // Querying how much space they need.
+ if (!pProperties) {
*pPropertyCount = 1;
return VK_SUCCESS;
}
- if (!*pPropertyCount) { // They gave us zero space to work with.
+ if (!*pPropertyCount) {
return VK_INCOMPLETE;
}
@@ -618,8 +604,7 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2(
vtable.GetPhysicalDeviceFeatures2(physical_device, pFeatures);
// We're going to use this feature for both VK_AMD_anti_lag and
- // VK_NV_low_latency2. It simplifies things a bit if we share a code path
- // for now. TODO remove it in the future for VK_AMD_anti_lag.
+ // VK_NV_low_latency2. It simplifies things a bit if we share a code path.
if (const auto pidf = find_next<VkPhysicalDevicePresentIdFeaturesKHR>(
pFeatures,
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENT_ID_FEATURES_KHR);
@@ -628,10 +613,10 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2(
pidf->presentId = true;
}
- // Don't provide AntiLag if we're trying to spoof nvidia.
- // Nvidia uses VkSurfaceCapabilities2KHR to determine if a surface
- // is capable of reflex instead of AMD's physical device switch found here.
- if (context->instance.layer.should_spoof_nvidia) {
+ // Don't provide AntiLag if we're exposing reflex - VK_NV_low_latency2 uses
+ // VkSurfaceCapabilities2KHR to determine if a surface is capable of reflex
+ // instead of AMD's physical device switch found here.
+ if (context->instance.layer.should_expose_reflex) {
return;
}
@@ -649,29 +634,6 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2KHR(
return GetPhysicalDeviceFeatures2(physical_device, pFeatures);
}
-static VKAPI_ATTR void VKAPI_CALL
-GetPhysicalDeviceProperties2(VkPhysicalDevice physical_device,
- VkPhysicalDeviceProperties2* pProperties) {
-
- const auto context = layer_context.get_context(physical_device);
- const auto& vtable = context->instance.vtable;
-
- vtable.GetPhysicalDeviceProperties2(physical_device, pProperties);
-
- constexpr auto NVIDIA_VENDOR_ID = 0x10DE;
- constexpr auto NVIDIA_DEVICE_ID = 0x2684; // rtx 4080 i think?
- if (context->instance.layer.should_spoof_nvidia) {
- pProperties->properties.vendorID = NVIDIA_VENDOR_ID;
- pProperties->properties.deviceID = NVIDIA_DEVICE_ID;
- }
-}
-
-static VKAPI_ATTR void VKAPI_CALL
-GetPhysicalDeviceProperties2KHR(VkPhysicalDevice physical_device,
- VkPhysicalDeviceProperties2* pProperties) {
- return GetPhysicalDeviceProperties2(physical_device, pProperties);
-}
-
static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceSurfaceCapabilities2KHR(
VkPhysicalDevice physical_device,
const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo,
@@ -684,7 +646,7 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceSurfaceCapabilities2KHR(
physical_device, pSurfaceInfo, pSurfaceCapabilities);
// Don't do this unless we're spoofing nvidia.
- if (!context->instance.layer.should_spoof_nvidia) {
+ if (!context->instance.layer.should_expose_reflex) {
return;
}
@@ -742,8 +704,10 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateSwapchainKHR(
was_low_latency_requested = slci->latencyModeEnable;
}
- context->swapchain_monitors.try_emplace(*pSwapchain, *context,
- was_low_latency_requested);
+
+ const auto [_, did_emplace] = context->swapchain_monitors.try_emplace(
+ *pSwapchain, *context, was_low_latency_requested);
+ assert(did_emplace);
return VK_SUCCESS;
}
@@ -753,7 +717,8 @@ DestroySwapchainKHR(VkDevice device, VkSwapchainKHR swapchain,
const VkAllocationCallbacks* pAllocator) {
const auto context = layer_context.get_context(device);
- assert(context->swapchain_monitors.erase(swapchain));
+ assert(context->swapchain_monitors.contains(swapchain));
+ context->swapchain_monitors.erase(swapchain);
context->vtable.DestroySwapchainKHR(device, swapchain, pAllocator);
}
@@ -765,9 +730,8 @@ AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD* pData) {
// AL2 is synchronous while NVIDIA's low_latency2 is asynchronous.
// It's difficult to model an asynchronous impl inside a synchronous impl,
- // but it's easy to do the inverse. As a result, we should implement
- // NVIDIA's method and then have a working AL2 implementation follow using
- // that existing code path.
+ // but it's easy to do the inverse. AMD's extension piggybacks on NVIDIA's
+ // more complicated implementation.
const auto present_delay = [&]() {
using namespace std::chrono;
@@ -777,12 +741,18 @@ AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD* pData) {
context->update_params(std::nullopt, present_delay,
(pData->mode == VK_ANTI_LAG_MODE_ON_AMD));
- if (!pData->pPresentationInfo) {
+ if (!pData->pPresentationInfo ||
+ pData->pPresentationInfo->stage != VK_ANTI_LAG_STAGE_INPUT_AMD) {
+
return;
}
- if (pData->pPresentationInfo->stage == VK_ANTI_LAG_STAGE_INPUT_AMD) {
- // TODO use nvidia's path
+ // VK_AMD_anti_lag doesn't provide a swapchain, so we can't map it to
+ // a queue. Our previous implementation used the last queue that presented
+ // and made sure that at least that one completed. I think it's more robust
+ // to make sure they all complete.
+ for (auto& iter : context->swapchain_monitors) {
+ iter.second.wait_until();
}
}
@@ -832,21 +802,18 @@ VkResult SetLatencySleepModeNV(VkDevice device, VkSwapchainKHR swapchain,
// If pSleepModeInfo is nullptr, it means no delay and no low latency.
context->update_params(swapchain, std::chrono::milliseconds{0}, false);
}
+
return VK_SUCCESS;
}
void SetLatencyMarkerNV(VkDevice device, VkSwapchainKHR swapchain,
const VkSetLatencyMarkerInfoNV* pLatencyMarkerInfo) {
// STUB
- // We will probably end up making use of this in the future, but afaict it's
- // not relevant for this layer's operation just yet. This function is
- // NVIDIA's way of giving developers insight into their render pipeline.
}
void GetLatencyTimingsNV(VkDevice device, VkSwapchainKHR swapchain,
VkGetLatencyMarkerInfoNV* pLatencyMarkerInfo) {
// STUB
- // Just like SetLatencyMarkerNV this isn't relevant for us just yet.
}
} // namespace low_latency
@@ -907,11 +874,6 @@ static const auto instance_functions = func_map_t{
HOOK_ENTRY("vkGetPhysicalDeviceSurfaceCapabilities2KHR",
low_latency::GetPhysicalDeviceSurfaceCapabilities2KHR),
-
- HOOK_ENTRY("vkGetPhysicalDeviceProperties2",
- low_latency::GetPhysicalDeviceProperties2),
- HOOK_ENTRY("vkGetPhysicalDeviceProperties2KHR",
- low_latency::GetPhysicalDeviceProperties2KHR),
};
static const auto device_functions = func_map_t{