From 644bc4ed5edd4e3ffa88750bdacb147c75df9546 Mon Sep 17 00:00:00 2001
From: Nicolas James <Eele1Ephe7uZahRie@tutanota.com>
Date: Mon, 30 Mar 2026 15:54:10 +1100
Subject: Fix AL2 via VK_NV_low_latency2's async implementation, fix race in
 TimestampPool

---
 src/context.hh                 |   5 --
 src/device_clock.cc            |   3 +-
 src/device_context.cc          |  34 -----------
 src/device_context.hh          |   9 +--
 src/helper.hh                  |   5 ++
 src/layer.cc                   | 132 +++++++++++++++--------------------------
 src/layer_context.cc           |   7 +--
 src/layer_context.hh           |  15 ++---
 src/physical_device_context.cc |  14 ++---
 src/physical_device_context.hh |  10 +---
 src/queue_context.cc           |  36 +++++------
 src/queue_context.hh           |   8 +--
 src/swapchain_monitor.cc       |  18 +++++-
 src/swapchain_monitor.hh       |   4 ++
 src/timestamp_pool.cc          |  50 ++++++++++------
 src/timestamp_pool.hh          |  13 ++--
 16 files changed, 157 insertions(+), 206 deletions(-)
diff --git a/src/context.hh b/src/context.hh
index 6524984..718b697 100644
--- a/src/context.hh
+++ b/src/context.hh
@@ -3,11 +3,6 @@
 
 namespace low_latency {
 
-#define THROW_NON_VKSUCCESS(x)                                                 \
-    if (const auto result = x; result != VK_SUCCESS) {                         \
-        throw result;                                                          \
-    }
-
 // A context class doesn't do much by itself. We just use it to provide a
 // virtual destructor so we can store a bunch of shared_ptrs in the same
 // container and rely on RTTI in the layer context. It also deletes the copy and
diff --git a/src/device_clock.cc b/src/device_clock.cc
index 52c86d3..8e0e408 100644
--- a/src/device_clock.cc
+++ b/src/device_clock.cc
@@ -1,5 +1,6 @@
 #include "device_clock.hh"
 #include "device_context.hh"
+#include "helper.hh"
 
 #include <vulkan/vulkan_core.h>
 
@@ -37,7 +38,7 @@ void DeviceClock::calibrate() {
     };
     auto calibrated_result = CalibratedResult{};
 
-    THROW_NON_VKSUCCESS(device.vtable.GetCalibratedTimestampsKHR(
+    THROW_NOT_VKSUCCESS(device.vtable.GetCalibratedTimestampsKHR(
         device.device, 2, std::data(infos), &calibrated_result.device,
         &this->error_bound));
 
diff --git a/src/device_context.cc b/src/device_context.cc
index 5438e40..e2f2a4a 100644
--- a/src/device_context.cc
+++ b/src/device_context.cc
@@ -28,40 +28,6 @@ DeviceContext::~DeviceContext() {
     }
 }
 
-/*
-void DeviceContext::sleep_in_input() {
-    // TODO
-
-    // Present hasn't happened yet, we don't know what queue to attack.
-    if (!this->present_queue) {
-        return;
-    }
-
-    const auto& frames = this->present_queue->in_flight_frames;
-    // No frame here means we're behind the GPU and do not need to delay.
-    // If anything we should speed up...
-    if (!std::size(frames)) {
-        return;
-    }
-
-    // If we're here, that means that there might be an outstanding frame that's
-    // sitting on our present_queue which hasn't yet completed, so we need to
-    // stall until it's finished.
-    const auto& last_frame = frames.back();
-    assert(std::size(last_frame.submissions));
-    const auto& last_frame_submission = last_frame.submissions.back();
-    last_frame_submission->end_handle->get_time_spinlock();
-
-    // From our sleep in present implementation, just spinning until
-    // the previous frame has completed did not work well. This was because
-    // there was a delay between presentation and when new work was given
-    // to the GPU. If we stalled the CPU without trying to account for this, we
-    // would get huge frame drops, loss of throughput, and the GPU would even
-    // clock down. So naturally I am concerned about this approach, but it seems
-    // to perform well so far in my own testing and is just beautifully elegant.
-}
-*/
-
 void DeviceContext::update_params(
     const std::optional<VkSwapchainKHR> target,
     const std::chrono::milliseconds& present_delay,
diff --git a/src/device_context.hh b/src/device_context.hh
index 172801c..0e0a4eb 100644
--- a/src/device_context.hh
+++ b/src/device_context.hh
@@ -24,7 +24,8 @@ class DeviceContext final : public Context {
     InstanceContext& instance;
     PhysicalDeviceContext& physical_device;
 
-    // Whether or not we were asked to do NV_VK_LowLatency2 or VK_AMD_anti_lag.
+    // Whether or not we were asked to do VK_NV_low_latency2 or VK_AMD_anti_lag
+    // at the device level.
     const bool was_capability_requested;
 
     const VkDevice device;
@@ -39,13 +40,13 @@ class DeviceContext final : public Context {
   public:
     DeviceContext(InstanceContext& parent_instance,
                   PhysicalDeviceContext& parent_physical,
-                  const VkDevice& device, const bool was_antilag_requested,
+                  const VkDevice& device, const bool was_capability_requested,
                   VkuDeviceDispatchTable&& vtable);
     virtual ~DeviceContext();
 
   public:
-    // Updates the settings associated with that swapchain. If none is provided
-    // all swapchains are set to this value.
+    // Updates the settings associated with that swapchain. If no swapchain
+    // target is provided all swapchains are set to this value.
     void update_params(const std::optional<VkSwapchainKHR> target,
                        const std::chrono::milliseconds& present_delay,
                        const bool was_low_latency_requested);
diff --git a/src/helper.hh b/src/helper.hh
index 468f146..6dde9be 100644
--- a/src/helper.hh
+++ b/src/helper.hh
@@ -8,6 +8,11 @@
 
 namespace low_latency {
 
+#define THROW_NOT_VKSUCCESS(x)                                                 \
+    do { /* do-while: expands to one statement, safe in unbraced if/else */    \
+        if (const auto result = (x); result != VK_SUCCESS) { throw result; }   \
+    } while (false)
+
 // Small templates which allow us to SFINAE find pNext structs.
 template <typename T>
 static T* find_next(void* const head, const VkStructureType& stype) {
diff --git a/src/layer.cc b/src/layer.cc
index 7a7ffc8..813c267 100644
--- a/src/layer.cc
+++ b/src/layer.cc
@@ -78,8 +78,6 @@ CreateInstance(const VkInstanceCreateInfo* pCreateInfo,
     INSTANCE_VTABLE_LOAD(GetPhysicalDeviceQueueFamilyProperties2);
     INSTANCE_VTABLE_LOAD(GetPhysicalDeviceFeatures2);
     INSTANCE_VTABLE_LOAD(GetPhysicalDeviceSurfaceCapabilities2KHR);
-    INSTANCE_VTABLE_LOAD(GetPhysicalDeviceProperties2);
-    INSTANCE_VTABLE_LOAD(GetPhysicalDeviceProperties2KHR);
 #undef INSTANCE_VTABLE_LOAD
 
     const auto lock = std::scoped_lock{layer_context.mutex};
@@ -102,11 +100,13 @@ DestroyInstance(VkInstance instance, const VkAllocationCallbacks* allocator) {
         // Erase our physical devices owned by this instance from the global
         // context.
         for (const auto& [key, _] : context->phys_devices) {
-            assert(layer_context.contexts.erase(key));
+            assert(layer_context.contexts.contains(key));
+            layer_context.contexts.erase(key);
         }
 
         const auto key = layer_context.get_key(instance);
-        assert(layer_context.contexts.erase(key));
+        assert(layer_context.contexts.contains(key));
+        layer_context.contexts.erase(key);
 
         // Should be the last ptr now like DestroyDevice.
         assert(context.unique());
@@ -154,23 +154,6 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
     const auto requested = std::unordered_set<std::string_view>(
         std::from_range, enabled_extensions);
 
-    // There's the antilag extension that might be requested here - Antilag2.
-    // Then there's the other thing we provide, which is our AntiLag1
-    // equivalent. Calling them AL1 and AL2, where AL1 is requested via
-    // an env var and AL2 is requested at the device level via the extension,
-    // the cases where we exit with a bad code or deliberately no-op are:
-    //
-    //     !SUPPORTED && !AL2 &&  AL1          -> No-op hooks
-    //                   !AL2 && !AL1          -> No-op hooks.
-    //     !SUPPORTED &&  AL2                  -> VK_ERROR_INITIALIZATION_FAILED
-    //
-    // Note that even though the user has explicitly enabled AL1 via an env var,
-    // failing hard here by returning INIT_FAILED if the device doesn't support
-    // it is wrong. The vulkan application could just be creating a device that
-    // cannot support it which is unrelated to anything present related. This
-    // is not the case with AL2, because the vulkan application has to
-    // explicitly ask for the extension when it creates the device.
-
     const auto was_capability_requested =
         requested.contains(VK_AMD_ANTI_LAG_EXTENSION_NAME) ||
         requested.contains(VK_NV_LOW_LATENCY_2_EXTENSION_NAME);
@@ -204,11 +187,9 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
 
         // Only append the extra extension if it wasn't already asked for.
         for (const auto& wanted : PhysicalDeviceContext::required_extensions) {
-            if (requested.contains(wanted)) {
-                continue;
+            if (!requested.contains(wanted)) {
+                next_extensions.push_back(wanted);
             }
-
-            next_extensions.push_back(wanted);
         }
 
         return next_extensions;
@@ -284,14 +265,16 @@ DestroyDevice(VkDevice device, const VkAllocationCallbacks* allocator) {
         // Remove all owned queues from our global context pool.
         for (const auto& [queue, _] : device_context->queues) {
             const auto key = layer_context.get_key(queue);
-            assert(layer_context.contexts.erase(key));
+            assert(layer_context.contexts.contains(key));
+            layer_context.contexts.erase(key);
         }
 
         const auto key = layer_context.get_key(device);
-        assert(layer_context.contexts.erase(key));
+        assert(layer_context.contexts.contains(key));
+        layer_context.contexts.erase(key);
 
-        // should be the last shared ptr now, so its destructor can be called.
-        // the destructor should expect its owned queues to be unique as well!
+        // Should be the last shared ptr now, so its destructor can be called.
+        // The destructor should expect its owned queues to be unique as well.
         assert(device_context.unique());
 
         return func;
@@ -361,7 +344,7 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count,
               const VkSubmitInfo* submit_infos, VkFence fence) {
 
     const auto context = layer_context.get_context(queue);
-    const auto& vtable = context->device_context.vtable;
+    const auto& vtable = context->device.vtable;
 
     if (!submit_count || !context->should_inject_timestamps()) {
         return vtable.QueueSubmit(queue, submit_count, submit_infos, fence);
@@ -447,7 +430,7 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count,
                const VkSubmitInfo2* submit_infos, VkFence fence) {
 
     const auto context = layer_context.get_context(queue);
-    const auto& vtable = context->device_context.vtable;
+    const auto& vtable = context->device.vtable;
 
     if (!submit_count || !context->should_inject_timestamps()) {
         return vtable.QueueSubmit2(queue, submit_count, submit_infos, fence);
@@ -511,7 +494,7 @@ static VKAPI_ATTR VkResult VKAPI_CALL
 vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* present_info) {
 
     const auto context = layer_context.get_context(queue);
-    const auto& vtable = context->device_context.vtable;
+    const auto& vtable = context->device.vtable;
 
     if (const auto res = vtable.QueuePresentKHR(queue, present_info);
         res != VK_SUCCESS) {
@@ -524,7 +507,11 @@ vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* present_info) {
 
     for (auto i = std::uint32_t{0}; i < present_info->swapchainCount; ++i) {
         const auto& swapchain = present_info->pSwapchains[i];
+
+        // For VK_AMD_anti_lag, providing a pPresentId isn't part of the spec.
+        // So we just set it to 0 if it isn't provided.
         const auto present_id = pid ? pid->pPresentIds[i] : 0;
+
         context->notify_present(swapchain, present_id);
     }
 
@@ -549,9 +536,9 @@ static VKAPI_ATTR VkResult VKAPI_CALL EnumerateDeviceExtensionProperties(
             physical_device, pLayerName, pPropertyCount, pProperties);
     }
 
-    // If we're spoofing nvidia we want to provide their extension instead.
+    // If we're exposing reflex we want to provide VK_NV_low_latency2 instead.
     const auto extension_properties = [&]() -> VkExtensionProperties {
-        if (context->instance.layer.should_spoof_nvidia) {
+        if (context->instance.layer.should_expose_reflex) {
             return {.extensionName = VK_NV_LOW_LATENCY_2_EXTENSION_NAME,
                     .specVersion = VK_NV_LOW_LATENCY_2_SPEC_VERSION};
         }
@@ -561,13 +548,12 @@ static VKAPI_ATTR VkResult VKAPI_CALL EnumerateDeviceExtensionProperties(
 
     if (pLayerName) {
         // This query is for our layer specifically.
-
-        if (!pProperties) { // Querying how much space they need.
+        if (!pProperties) {
             *pPropertyCount = 1;
             return VK_SUCCESS;
         }
 
-        if (!*pPropertyCount) { // They gave us zero space to work with.
+        if (!*pPropertyCount) {
             return VK_INCOMPLETE;
         }
 
@@ -618,8 +604,7 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2(
     vtable.GetPhysicalDeviceFeatures2(physical_device, pFeatures);
 
     // We're going to use this feature for both VK_AMD_anti_lag and
-    // VK_NV_low_latency2. It simplifies things a bit if we share a code path
-    // for now. TODO remove it in the future for VK_AMD_anti_lag.
+    // VK_NV_low_latency2. It simplifies things a bit if we share a code path.
     if (const auto pidf = find_next<VkPhysicalDevicePresentIdFeaturesKHR>(
             pFeatures,
             VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENT_ID_FEATURES_KHR);
@@ -628,10 +613,10 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2(
         pidf->presentId = true;
     }
 
-    // Don't provide AntiLag if we're trying to spoof nvidia.
-    // Nvidia uses VkSurfaceCapabilities2KHR to determine if a surface
-    // is capable of reflex instead of AMD's physical device switch found here.
-    if (context->instance.layer.should_spoof_nvidia) {
+    // Don't provide AntiLag if we're exposing reflex - VK_NV_low_latency2 uses
+    // VkSurfaceCapabilities2KHR to determine if a surface is capable of reflex
+    // instead of AMD's physical device switch found here.
+    if (context->instance.layer.should_expose_reflex) {
         return;
     }
 
@@ -649,29 +634,6 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2KHR(
     return GetPhysicalDeviceFeatures2(physical_device, pFeatures);
 }
 
-static VKAPI_ATTR void VKAPI_CALL
-GetPhysicalDeviceProperties2(VkPhysicalDevice physical_device,
-                             VkPhysicalDeviceProperties2* pProperties) {
-
-    const auto context = layer_context.get_context(physical_device);
-    const auto& vtable = context->instance.vtable;
-
-    vtable.GetPhysicalDeviceProperties2(physical_device, pProperties);
-
-    constexpr auto NVIDIA_VENDOR_ID = 0x10DE;
-    constexpr auto NVIDIA_DEVICE_ID = 0x2684; // rtx 4080 i think?
-    if (context->instance.layer.should_spoof_nvidia) {
-        pProperties->properties.vendorID = NVIDIA_VENDOR_ID;
-        pProperties->properties.deviceID = NVIDIA_DEVICE_ID;
-    }
-}
-
-static VKAPI_ATTR void VKAPI_CALL
-GetPhysicalDeviceProperties2KHR(VkPhysicalDevice physical_device,
-                                VkPhysicalDeviceProperties2* pProperties) {
-    return GetPhysicalDeviceProperties2(physical_device, pProperties);
-}
-
 static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceSurfaceCapabilities2KHR(
     VkPhysicalDevice physical_device,
     const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo,
@@ -684,7 +646,7 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceSurfaceCapabilities2KHR(
         physical_device, pSurfaceInfo, pSurfaceCapabilities);
 
-    // Don't do this unless we're spoofing nvidia.
+    // Don't do this unless we're exposing reflex.
-    if (!context->instance.layer.should_spoof_nvidia) {
+    if (!context->instance.layer.should_expose_reflex) {
         return;
     }
 
@@ -742,8 +704,10 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateSwapchainKHR(
 
         was_low_latency_requested = slci->latencyModeEnable;
     }
-    context->swapchain_monitors.try_emplace(*pSwapchain, *context,
-                                            was_low_latency_requested);
+
+    const auto [_, did_emplace] = context->swapchain_monitors.try_emplace(
+        *pSwapchain, *context, was_low_latency_requested);
+    assert(did_emplace);
 
     return VK_SUCCESS;
 }
@@ -753,7 +717,8 @@ DestroySwapchainKHR(VkDevice device, VkSwapchainKHR swapchain,
                     const VkAllocationCallbacks* pAllocator) {
     const auto context = layer_context.get_context(device);
 
-    assert(context->swapchain_monitors.erase(swapchain));
+    assert(context->swapchain_monitors.contains(swapchain));
+    context->swapchain_monitors.erase(swapchain);
 
     context->vtable.DestroySwapchainKHR(device, swapchain, pAllocator);
 }
@@ -765,9 +730,8 @@ AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD* pData) {
 
-    // AL2 is a synchronous while NVIDIA's low_latencty2 is asynchronous.
+    // AL2 is synchronous while NVIDIA's low_latency2 is asynchronous.
     // It's difficult to model an asynchronous impl inside a synchronous impl,
-    // but it's easy to do the inverse. As a result, we should implement
-    // NVIDIA's method and then have a working AL2 implementation follow using
-    // that existing code path.
+    // but it's easy to do the inverse. AMD's extension piggybacks on NVIDIA's
+    // more complicated implementation.
 
     const auto present_delay = [&]() {
         using namespace std::chrono;
@@ -777,12 +741,18 @@ AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD* pData) {
     context->update_params(std::nullopt, present_delay,
                            (pData->mode == VK_ANTI_LAG_MODE_ON_AMD));
 
-    if (!pData->pPresentationInfo) {
+    if (!pData->pPresentationInfo ||
+        pData->pPresentationInfo->stage != VK_ANTI_LAG_STAGE_INPUT_AMD) {
+
         return;
     }
 
-    if (pData->pPresentationInfo->stage == VK_ANTI_LAG_STAGE_INPUT_AMD) {
-        // TODO use nvidia's path
+    // VK_AMD_anti_lag doesn't provide a swapchain, so we can't map it to
+    // a queue. Our previous implementation used the last queue that presented
+    // and made sure that at least that one completed. I think it's more robust
+    // to make sure they all complete.
+    for (auto& iter : context->swapchain_monitors) {
+        iter.second.wait_until();
     }
 }
 
@@ -832,21 +802,18 @@ VkResult SetLatencySleepModeNV(VkDevice device, VkSwapchainKHR swapchain,
         // If pSleepModeInfo is nullptr, it means no delay and no low latency.
         context->update_params(swapchain, std::chrono::milliseconds{0}, false);
     }
+
     return VK_SUCCESS;
 }
 
 void SetLatencyMarkerNV(VkDevice device, VkSwapchainKHR swapchain,
                         const VkSetLatencyMarkerInfoNV* pLatencyMarkerInfo) {
     // STUB
-    // We will probably end up making use of this in the future, but afaict it's
-    // not relevant for this layer's operation just yet. This function is
-    // NVIDIA's way of giving developers insight into their render pipeline.
 }
 
 void GetLatencyTimingsNV(VkDevice device, VkSwapchainKHR swapchain,
                          VkGetLatencyMarkerInfoNV* pLatencyMarkerInfo) {
     // STUB
-    // Just like SetLatencyMarkerNV this isn't relevant for us just yet.
 }
 
 } // namespace low_latency
@@ -907,11 +874,6 @@ static const auto instance_functions = func_map_t{
 
     HOOK_ENTRY("vkGetPhysicalDeviceSurfaceCapabilities2KHR",
                low_latency::GetPhysicalDeviceSurfaceCapabilities2KHR),
-
-    HOOK_ENTRY("vkGetPhysicalDeviceProperties2",
-               low_latency::GetPhysicalDeviceProperties2),
-    HOOK_ENTRY("vkGetPhysicalDeviceProperties2KHR",
-               low_latency::GetPhysicalDeviceProperties2KHR),
 };
 
 static const auto device_functions = func_map_t{
diff --git a/src/layer_context.cc b/src/layer_context.cc
index 4699202..4399338 100644
--- a/src/layer_context.cc
+++ b/src/layer_context.cc
@@ -4,15 +4,14 @@
 #include <string_view>
 
 namespace low_latency {
-    
+
 LayerContext::LayerContext() {
     const auto parse_bool_env = [](const auto& name) -> bool {
         const auto env = std::getenv(name);
         return env && std::string_view{env} == "1";
     };
-    
-    this->is_antilag_1_enabled = parse_bool_env(SLEEP_AFTER_PRESENT_ENV);
-    this->should_spoof_nvidia = parse_bool_env(SPOOF_NVIDIA_ENV);
+
+    this->should_expose_reflex = parse_bool_env(EXPOSE_REFLEX_ENV);
 }
 
 LayerContext::~LayerContext() {}
diff --git a/src/layer_context.hh b/src/layer_context.hh
index 95f1cd5..731b273 100644
--- a/src/layer_context.hh
+++ b/src/layer_context.hh
@@ -48,22 +48,15 @@ using dispatch_context_t = typename context_for_t<D>::context;
 
 class LayerContext final : public Context {
   private:
-    // If this is not null and set to exactly 1, then we should sleep after
-    // present.
-    static constexpr auto SLEEP_AFTER_PRESENT_ENV =
-        "LOW_LATENCY_LAYER_SLEEP_AFTER_PRESENT";
-
-    // If this is not null and set to exactly 1, then VK_NV_low_latency2
-    // should be provided instead of VK_AMD_anti_lag.
-    static constexpr auto SPOOF_NVIDIA_ENV =
-        "LOW_LATENCY_LAYER_SPOOF_NVIDIA";
+    // If this is not null and set to 1 then VK_NV_low_latency2 should be
+    // provided instead of VK_AMD_anti_lag.
+    static constexpr auto EXPOSE_REFLEX_ENV = "LOW_LATENCY_LAYER_EXPOSE_REFLEX";
 
   public:
     std::mutex mutex;
     std::unordered_map<void*, std::shared_ptr<Context>> contexts;
 
-    bool is_antilag_1_enabled = false;
-    bool should_spoof_nvidia = false;
+    bool should_expose_reflex = false;
 
   public:
     LayerContext();
diff --git a/src/physical_device_context.cc b/src/physical_device_context.cc
index 9c4ad8e..86bf9ab 100644
--- a/src/physical_device_context.cc
+++ b/src/physical_device_context.cc
@@ -1,4 +1,5 @@
 #include "physical_device_context.hh"
+#include "helper.hh"
 
 #include <vulkan/vulkan_core.h>
 
@@ -26,25 +27,24 @@ PhysicalDeviceContext::PhysicalDeviceContext(
         vtable.GetPhysicalDeviceQueueFamilyProperties2(physical_device, &count,
                                                        nullptr);
 
-        using qp_t = PhysicalDeviceContext::queue_properties_t;
-        auto result = qp_t(
+        auto result = std::vector<VkQueueFamilyProperties2>(
             count, VkQueueFamilyProperties2{
                        .sType = VK_STRUCTURE_TYPE_QUEUE_FAMILY_PROPERTIES_2});
         vtable.GetPhysicalDeviceQueueFamilyProperties2(physical_device, &count,
                                                        std::data(result));
 
-        return std::make_unique<qp_t>(std::move(result));
+        return std::make_unique<std::vector<VkQueueFamilyProperties2>>(
+            std::move(result));
     }();
 
     this->supports_required_extensions = [&]() {
         auto count = std::uint32_t{};
-        THROW_NON_VKSUCCESS(vtable.EnumerateDeviceExtensionProperties(
+        THROW_NOT_VKSUCCESS(vtable.EnumerateDeviceExtensionProperties(
             physical_device, nullptr, &count, nullptr));
 
         auto supported_extensions = std::vector<VkExtensionProperties>(count);
-        THROW_NON_VKSUCCESS(vtable.EnumerateDeviceExtensionProperties(
-            physical_device, nullptr, &count,
-            std::data(supported_extensions)));
+        THROW_NOT_VKSUCCESS(vtable.EnumerateDeviceExtensionProperties(
+            physical_device, nullptr, &count, std::data(supported_extensions)));
 
         const auto supported =
             supported_extensions |
diff --git a/src/physical_device_context.hh b/src/physical_device_context.hh
index f7ad289..d2e094e 100644
--- a/src/physical_device_context.hh
+++ b/src/physical_device_context.hh
@@ -23,16 +23,12 @@ class PhysicalDeviceContext final : public Context {
 
   public:
     InstanceContext& instance;
-
     const VkPhysicalDevice physical_device;
 
-    std::unique_ptr<const VkPhysicalDeviceProperties> properties;
-
-    using queue_properties_t = std::vector<VkQueueFamilyProperties2>;
-    std::unique_ptr<const queue_properties_t> queue_properties;
+    std::unique_ptr<VkPhysicalDeviceProperties> properties;
+    std::unique_ptr<std::vector<VkQueueFamilyProperties2>> queue_properties;
 
-    // Will be true if the physical device supports everything in
-    // this->required_extensions.
+    // Will be true if the physical device supports all of required_extensions.
     bool supports_required_extensions = false;
 
   public:
diff --git a/src/queue_context.cc b/src/queue_context.cc
index 30e73c1..1192bb6 100644
--- a/src/queue_context.cc
+++ b/src/queue_context.cc
@@ -1,44 +1,43 @@
 #include "queue_context.hh"
 #include "device_context.hh"
-#include "layer_context.hh"
+#include "helper.hh"
 #include "timestamp_pool.hh"
 
 #include <span>
+
 #include <vulkan/vulkan_core.h>
 
 namespace low_latency {
 
-QueueContext::CommandPoolOwner::CommandPoolOwner(
-    const QueueContext& queue_context)
-    : queue_context(queue_context) {
+QueueContext::CommandPoolOwner::CommandPoolOwner(const QueueContext& queue)
+    : queue(queue) {
 
-    const auto& device_context = this->queue_context.device_context;
+    const auto& device_context = this->queue.device;
 
     const auto cpci = VkCommandPoolCreateInfo{
         .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
         .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT |
                  VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
-        .queueFamilyIndex = queue_context.queue_family_index,
+        .queueFamilyIndex = queue.queue_family_index,
     };
 
-    THROW_NON_VKSUCCESS(device_context.vtable.CreateCommandPool(
+    THROW_NOT_VKSUCCESS(device_context.vtable.CreateCommandPool(
         device_context.device, &cpci, nullptr, &this->command_pool));
 }
 
 QueueContext::CommandPoolOwner::~CommandPoolOwner() {
-    const auto& device_context = this->queue_context.device_context;
+    const auto& device_context = this->queue.device;
     device_context.vtable.DestroyCommandPool(device_context.device,
                                              this->command_pool, nullptr);
 }
 
-QueueContext::QueueContext(DeviceContext& device_context, const VkQueue& queue,
+QueueContext::QueueContext(DeviceContext& device, const VkQueue& queue,
                            const std::uint32_t& queue_family_index)
-    : device_context(device_context), queue(queue),
-      queue_family_index(queue_family_index),
+    : device(device), queue(queue), queue_family_index(queue_family_index),
       command_pool(std::make_unique<CommandPoolOwner>(*this)) {
 
     // Only construct a timestamp pool if we support it!
-    if (device_context.physical_device.supports_required_extensions) {
+    if (device.physical_device.supports_required_extensions) {
         this->timestamp_pool = std::make_unique<TimestampPool>(*this);
     }
 }
@@ -77,7 +76,6 @@ void QueueContext::notify_submit(
 
 void QueueContext::notify_present(const VkSwapchainKHR& swapchain,
                                   const present_id_t& present_id) {
-
     // Notify the device that this swapchain was just presented to.
     // We're avoiding a double hash here - don't use operator[] and erase.
     auto iter = this->unpresented_submissions.try_emplace(present_id).first;
@@ -86,24 +84,28 @@ void QueueContext::notify_present(const VkSwapchainKHR& swapchain,
             std::make_shared<std::deque<std::unique_ptr<Submission>>>();
     }
 
-    this->device_context.notify_present(swapchain, iter->second);
+    this->device.notify_present(swapchain, iter->second);
 
     // Important, we nuke the submission because now it's presented.
     this->unpresented_submissions.erase(iter);
 }
 
 bool QueueContext::should_inject_timestamps() const {
-    const auto& physical_device = this->device_context.physical_device;
+    const auto& physical_device = this->device.physical_device;
 
+    // Our layer is a no-op here if we don't support it.
     if (!physical_device.supports_required_extensions) {
         return false;
     }
 
     // Don't bother injecting timestamps during queue submission if we
     // aren't planning on doing anything anyway.
-    if (!this->device_context.was_capability_requested &&
-        !physical_device.instance.layer.is_antilag_1_enabled) {
+    if (!this->device.was_capability_requested) {
+        return false;
+    }
 
+    // Don't do it if we've been marked as 'out of band' by nvidia's extension.
+    if (this->should_ignore_latency) {
         return false;
     }
 
diff --git a/src/queue_context.hh b/src/queue_context.hh
index 48500e1..a52e718 100644
--- a/src/queue_context.hh
+++ b/src/queue_context.hh
@@ -22,18 +22,18 @@ class QueueContext final : public Context {
     static constexpr auto MAX_TRACKED_SUBMISSIONS = 50u;
 
   public:
-    DeviceContext& device_context;
+    DeviceContext& device;
 
     const VkQueue queue;
     const std::uint32_t queue_family_index;
 
     struct CommandPoolOwner final {
       private:
-        const QueueContext& queue_context;
+        const QueueContext& queue;
         VkCommandPool command_pool;
 
       public:
-        CommandPoolOwner(const QueueContext& queue_context);
+        CommandPoolOwner(const QueueContext& queue);
         CommandPoolOwner(const CommandPoolOwner&) = delete;
         CommandPoolOwner(CommandPoolOwner&&) = delete;
         CommandPoolOwner operator=(const CommandPoolOwner&) = delete;
@@ -67,7 +67,7 @@ class QueueContext final : public Context {
     //
     // When our hook sees a VkQueuePresentKHR, we take the provided present_id
     // and notify our device that it needs to watch for when this completes.
-    // We give it our submission. Now, it's out of our hands. We remove the
+    // We give it our submissions. Now, it's out of our hands. We remove the
     // present_id_t mapping when doing so.
     struct Submission {
         std::shared_ptr<TimestampPool::Handle> head_handle, tail_handle;
diff --git a/src/swapchain_monitor.cc b/src/swapchain_monitor.cc
index 09fa8ba..adeb315 100644
--- a/src/swapchain_monitor.cc
+++ b/src/swapchain_monitor.cc
@@ -1,5 +1,6 @@
 #include "swapchain_monitor.hh"
 #include "device_context.hh"
+#include "helper.hh"
 
 #include <vulkan/vulkan_core.h>
 
@@ -23,7 +24,7 @@ void SwapchainMonitor::WakeupSemaphore::signal(
         VkSemaphoreSignalInfo{.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO,
                               .semaphore = this->timeline_semaphore,
                               .value = this->value};
-    THROW_NON_VKSUCCESS(device.vtable.SignalSemaphore(device.device, &ssi));
+    THROW_NOT_VKSUCCESS(device.vtable.SignalSemaphore(device.device, &ssi));
 }
 
 void SwapchainMonitor::do_swapchain_monitor(const std::stop_token stoken) {
@@ -109,4 +110,19 @@ void SwapchainMonitor::notify_present(
     this->cv.notify_one();
 }
 
+void SwapchainMonitor::wait_until() {
+    // No reason to lock when using VK_AMD_anti_lag.
+    // NOTE(review): confirm do_swapchain_monitor() can't race with this access.
+    if (this->in_flight_submissions.empty()) {
+        return;
+    }
+    const auto last_submissions = this->in_flight_submissions.back();
+    this->in_flight_submissions.clear();
+    if (last_submissions->empty()) {
+        return;
+    }
+    // Only the newest batch's final tail handle is awaited.
+    last_submissions->back()->tail_handle->await_time();
+}
+
 } // namespace low_latency
\ No newline at end of file
diff --git a/src/swapchain_monitor.hh b/src/swapchain_monitor.hh
index 5678630..be81d59 100644
--- a/src/swapchain_monitor.hh
+++ b/src/swapchain_monitor.hh
@@ -62,6 +62,10 @@ class SwapchainMonitor {
                           const std::uint64_t& value);
 
     void notify_present(const QueueContext::submissions_t& submissions);
+
+  public:
+    // Block until in-flight submissions complete; clears the in-flight list.
+    void wait_until();
 };
 
 } // namespace low_latency
diff --git a/src/timestamp_pool.cc b/src/timestamp_pool.cc
index a37b2bc..4bb236b 100644
--- a/src/timestamp_pool.cc
+++ b/src/timestamp_pool.cc
@@ -1,7 +1,9 @@
 #include "timestamp_pool.hh"
 #include "device_context.hh"
+#include "helper.hh"
 #include "queue_context.hh"
 
+#include <mutex>
 #include <ranges>
 #include <span>
 #include <vulkan/utility/vk_dispatch_table.h>
@@ -13,18 +15,18 @@ TimestampPool::QueryChunk::QueryPoolOwner::QueryPoolOwner(
     const QueueContext& queue_context)
     : queue_context(queue_context) {
 
-    const auto& device_context = this->queue_context.device_context;
+    const auto& device_context = this->queue_context.device;
     const auto qpci =
         VkQueryPoolCreateInfo{.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
                               .queryType = VK_QUERY_TYPE_TIMESTAMP,
                               .queryCount = QueryChunk::CHUNK_SIZE};
 
-    THROW_NON_VKSUCCESS(device_context.vtable.CreateQueryPool(
+    THROW_NOT_VKSUCCESS(device_context.vtable.CreateQueryPool(
         device_context.device, &qpci, nullptr, &this->query_pool));
 }
 
 TimestampPool::QueryChunk::QueryPoolOwner::~QueryPoolOwner() {
-    const auto& device_context = this->queue_context.device_context;
+    const auto& device_context = this->queue_context.device;
     device_context.vtable.DestroyQueryPool(device_context.device,
                                            this->query_pool, nullptr);
 }
@@ -43,7 +45,7 @@ TimestampPool::QueryChunk::CommandBuffersOwner::CommandBuffersOwner(
     const QueueContext& queue_context)
     : queue_context(queue_context), command_buffers(CHUNK_SIZE) {
 
-    const auto& device_context = queue_context.device_context;
+    const auto& device_context = queue_context.device;
 
     const auto cbai = VkCommandBufferAllocateInfo{
         .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
@@ -51,12 +53,12 @@ TimestampPool::QueryChunk::CommandBuffersOwner::CommandBuffersOwner(
         .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
         .commandBufferCount = CHUNK_SIZE,
     };
-    THROW_NON_VKSUCCESS(device_context.vtable.AllocateCommandBuffers(
+    THROW_NOT_VKSUCCESS(device_context.vtable.AllocateCommandBuffers(
         device_context.device, &cbai, std::data(this->command_buffers)));
 }
 
 TimestampPool::QueryChunk::CommandBuffersOwner::~CommandBuffersOwner() {
-    const auto& device_context = this->queue_context.device_context;
+    const auto& device_context = this->queue_context.device;
 
     device_context.vtable.FreeCommandBuffers(
         device_context.device, *this->queue_context.command_pool,
@@ -64,6 +66,13 @@ TimestampPool::QueryChunk::CommandBuffersOwner::~CommandBuffersOwner() {
         std::data(this->command_buffers));
 }
 
+VkCommandBuffer TimestampPool::QueryChunk::CommandBuffersOwner::operator[](
+    const std::size_t& i) {
+
+    assert(i < CHUNK_SIZE);
+    return this->command_buffers[i];
+}
+
 TimestampPool::QueryChunk::~QueryChunk() {}
 
 TimestampPool::TimestampPool(QueueContext& queue_context)
@@ -75,6 +84,7 @@ TimestampPool::TimestampPool(QueueContext& queue_context)
 }
 
 std::shared_ptr<TimestampPool::Handle> TimestampPool::acquire() {
+    const auto lock = std::scoped_lock{this->mutex};
 
     // Gets the empty one, or inserts a new one and returns it.
     const auto not_empty_iter = [this]() -> auto {
@@ -97,12 +107,12 @@ std::shared_ptr<TimestampPool::Handle> TimestampPool::acquire() {
     // Grab any element from our set and erase it immediately after.
     auto& indices = *(*not_empty_iter)->free_indices;
     const auto query_index = *std::begin(indices);
-    assert(indices.erase(query_index));
+    indices.erase(query_index);
 
     return std::make_shared<Handle>(*this, *not_empty_iter, query_index);
 }
 
-TimestampPool::Handle::Handle(const TimestampPool& timestamp_pool,
+TimestampPool::Handle::Handle(TimestampPool& timestamp_pool,
                               const std::shared_ptr<QueryChunk>& origin_chunk,
                               const std::uint64_t& query_index)
     : timestamp_pool(timestamp_pool), origin_chunk(origin_chunk),
@@ -110,10 +120,12 @@ TimestampPool::Handle::Handle(const TimestampPool& timestamp_pool,
       command_buffer((*origin_chunk->command_buffers)[query_index]) {}
 
 TimestampPool::Handle::~Handle() {
+    const auto lock = std::scoped_lock{this->timestamp_pool.mutex};
+    // NOTE(review): the lock above needs the pool to outlive this Handle.
     // Parent destructing shouldn't mean we should have a bunch of
     // insertions for zero reason.
     if (const auto ptr = this->origin_chunk.lock(); ptr) {
-        assert(ptr->free_indices->insert(this->query_index).second);
+        ptr->free_indices->insert(this->query_index);
     }
 }
 
@@ -124,32 +136,32 @@ void TimestampPool::Handle::setup_command_buffers(
         .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
     };
 
-    const auto& device_context = queue_context.device_context;
+    const auto& device_context = queue_context.device;
     const auto& vtable = device_context.vtable;
 
     vtable.ResetQueryPoolEXT(device_context.device, this->query_pool,
                              static_cast<std::uint32_t>(this->query_index), 1);
 
-    THROW_NON_VKSUCCESS(vtable.ResetCommandBuffer(this->command_buffer, 0));
-    THROW_NON_VKSUCCESS(vtable.BeginCommandBuffer(this->command_buffer, &cbbi));
+    THROW_NOT_VKSUCCESS(vtable.ResetCommandBuffer(this->command_buffer, 0));
+    THROW_NOT_VKSUCCESS(vtable.BeginCommandBuffer(this->command_buffer, &cbbi));
 
     vtable.CmdWriteTimestamp2KHR(
         this->command_buffer, VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
         this->query_pool, static_cast<std::uint32_t>(this->query_index));
 
-    THROW_NON_VKSUCCESS(vtable.EndCommandBuffer(this->command_buffer));
+    THROW_NOT_VKSUCCESS(vtable.EndCommandBuffer(this->command_buffer));
 
     vtable.ResetQueryPoolEXT(device_context.device, tail.query_pool,
                              static_cast<std::uint32_t>(tail.query_index), 1);
 
-    THROW_NON_VKSUCCESS(vtable.ResetCommandBuffer(tail.command_buffer, 0));
-    THROW_NON_VKSUCCESS(vtable.BeginCommandBuffer(tail.command_buffer, &cbbi));
+    THROW_NOT_VKSUCCESS(vtable.ResetCommandBuffer(tail.command_buffer, 0));
+    THROW_NOT_VKSUCCESS(vtable.BeginCommandBuffer(tail.command_buffer, &cbbi));
 
     vtable.CmdWriteTimestamp2KHR(
         tail.command_buffer, VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT,
         tail.query_pool, static_cast<std::uint32_t>(tail.query_index));
 
-    THROW_NON_VKSUCCESS(vtable.EndCommandBuffer(tail.command_buffer));
+    THROW_NOT_VKSUCCESS(vtable.EndCommandBuffer(tail.command_buffer));
 }
 
 struct QueryResult {
@@ -157,7 +169,7 @@ struct QueryResult {
     std::uint64_t available;
 };
 std::optional<DeviceClock::time_point_t> TimestampPool::Handle::get_time() {
-    const auto& context = this->timestamp_pool.queue_context.device_context;
+    const auto& context = this->timestamp_pool.queue_context.device;
     const auto& vtable = context.vtable;
 
     auto query_result = QueryResult{};
@@ -180,7 +192,7 @@ std::optional<DeviceClock::time_point_t> TimestampPool::Handle::get_time() {
 }
 
 DeviceClock::time_point_t TimestampPool::Handle::await_time() {
-    const auto& context = this->timestamp_pool.queue_context.device_context;
+    const auto& context = this->timestamp_pool.queue_context.device;
     const auto& vtable = context.vtable;
 
     struct QueryResult {
@@ -189,7 +201,7 @@ DeviceClock::time_point_t TimestampPool::Handle::await_time() {
     };
     auto query_result = QueryResult{};
 
-    THROW_NON_VKSUCCESS(vtable.GetQueryPoolResults(
+    THROW_NOT_VKSUCCESS(vtable.GetQueryPoolResults(
         context.device, query_pool,
         static_cast<std::uint32_t>(this->query_index), 1, sizeof(query_result),
         &query_result, sizeof(query_result),
diff --git a/src/timestamp_pool.hh b/src/timestamp_pool.hh
index 0d6c52d..d8ee359 100644
--- a/src/timestamp_pool.hh
+++ b/src/timestamp_pool.hh
@@ -3,7 +3,7 @@
 
 // The purpose of this file is to provide the definition of a 'timestamp pool'.
 // It manages blocks of timestamp query pools, hands them out when requested,
-// and allocates more when (if) we run out.
+// and allocates more when (if) we run out. It _should_ be thread safe.
 // Usage:
 //     1. Get handle with .acquire().
 //     2. Write start/end timestamp operations with the handle's pool and index
@@ -15,6 +15,7 @@
 #include <vulkan/vulkan.hpp>
 
 #include <memory>
+#include <mutex>
 #include <unordered_set>
 #include <vector>
 
@@ -28,6 +29,7 @@ class DeviceContext;
 class TimestampPool final {
   private:
     QueueContext& queue_context;
+    std::mutex mutex;
 
     // A chunk of data which is useful for making timestamp queries.
     // Allows association of an index to a query pool and command buffer.
@@ -72,10 +74,7 @@ class TimestampPool final {
             ~CommandBuffersOwner();
 
           public:
-            VkCommandBuffer operator[](const std::size_t& i) {
-                assert(i < CHUNK_SIZE);
-                return this->command_buffers[i];
-            }
+            VkCommandBuffer operator[](const std::size_t& i);
         };
         std::unique_ptr<CommandBuffersOwner> command_buffers;
 
@@ -98,7 +97,7 @@ class TimestampPool final {
         friend class TimestampPool;
 
       private:
-        const TimestampPool& timestamp_pool;
+        TimestampPool& timestamp_pool;
         const std::weak_ptr<QueryChunk> origin_chunk;
 
       public:
@@ -107,7 +106,7 @@ class TimestampPool final {
         const VkCommandBuffer command_buffer;
 
       public:
-        Handle(const TimestampPool& timestamp_pool,
+        Handle(TimestampPool& timestamp_pool,
                const std::shared_ptr<QueryChunk>& origin_chunk,
                const std::uint64_t& query_index);
         Handle(const Handle& handle) = delete;
-- 
cgit v1.2.3