aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNicolas James <nj3ahxac@gmail.com>2026-03-29 20:44:23 +1100
committerNicolas James <nj3ahxac@gmail.com>2026-03-29 20:44:23 +1100
commit681bd5096ee416b50dd7338de30af7b3db385a36 (patch)
tree358b6bc6f9a3af66729b8ac3b15dd38cc0f4bd2a
parentd5ef2dbbd77c69dd93e92d5b7046a65c2361b59b (diff)
Implement Reflex - break AntiLag in the process. Remove AntiLag1. WIP
-rw-r--r--src/device_clock.cc73
-rw-r--r--src/device_clock.hh49
-rw-r--r--src/device_context.cc107
-rw-r--r--src/device_context.hh56
-rw-r--r--src/helper.cc5
-rw-r--r--src/helper.hh59
-rw-r--r--src/layer.cc141
-rw-r--r--src/physical_device_context.hh8
-rw-r--r--src/queue_context.cc352
-rw-r--r--src/queue_context.hh96
-rw-r--r--src/swapchain_monitor.cc112
-rw-r--r--src/swapchain_monitor.hh69
-rw-r--r--src/timestamp_pool.cc54
-rw-r--r--src/timestamp_pool.hh20
14 files changed, 575 insertions, 626 deletions
diff --git a/src/device_clock.cc b/src/device_clock.cc
new file mode 100644
index 0000000..52c86d3
--- /dev/null
+++ b/src/device_clock.cc
@@ -0,0 +1,73 @@
+#include "device_clock.hh"
+#include "device_context.hh"
+
+#include <vulkan/vulkan_core.h>
+
+#include <cassert>
+#include <time.h>
+
+namespace low_latency {
+
+DeviceClock::DeviceClock(const DeviceContext& context) : device(context) {
+ this->calibrate();
+}
+
+DeviceClock::~DeviceClock() {}
+
+DeviceClock::time_point_t DeviceClock::now() {
+ auto ts = timespec{};
+ if (clock_gettime(CLOCK_MONOTONIC, &ts)) {
+ throw errno;
+ }
+
+ return time_point_t{std::chrono::seconds{ts.tv_sec} +
+ std::chrono::nanoseconds{ts.tv_nsec}};
+}
+
+void DeviceClock::calibrate() {
+ const auto infos = std::vector<VkCalibratedTimestampInfoKHR>{
+ {VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr,
+ VK_TIME_DOMAIN_DEVICE_EXT},
+ {VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr,
+ VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT}};
+
+ struct CalibratedResult {
+ std::uint64_t device;
+ std::uint64_t host;
+ };
+ auto calibrated_result = CalibratedResult{};
+
+ THROW_NON_VKSUCCESS(device.vtable.GetCalibratedTimestampsKHR(
+ device.device, 2, std::data(infos), &calibrated_result.device,
+ &this->error_bound));
+
+ this->device_ticks = calibrated_result.device;
+ this->host_ns = calibrated_result.host;
+}
+
+DeviceClock::time_point_t
+DeviceClock::ticks_to_time(const std::uint64_t& ticks) const {
+ const auto& pd = device.physical_device.properties;
+ const auto ns_tick = static_cast<double>(pd->limits.timestampPeriod);
+
+ const auto diff = [&]() -> auto {
+ auto a = this->device_ticks;
+ auto b = ticks;
+ const auto is_negative = a > b;
+ if (is_negative) {
+ std::swap(a, b);
+ }
+ const auto abs_diff = b - a;
+ assert(abs_diff <= std::numeric_limits<std::int64_t>::max());
+ const auto signed_abs_diff = static_cast<std::int64_t>(abs_diff);
+ return is_negative ? -signed_abs_diff : signed_abs_diff;
+ }();
+
+ const auto diff_nsec =
+ static_cast<std::int64_t>(static_cast<double>(diff) * ns_tick + 0.5);
+ const auto delta = std::chrono::nanoseconds(
+ this->host_ns + static_cast<std::uint64_t>(diff_nsec));
+ return time_point_t{delta};
+}
+
+} // namespace low_latency \ No newline at end of file
diff --git a/src/device_clock.hh b/src/device_clock.hh
new file mode 100644
index 0000000..a52c59c
--- /dev/null
+++ b/src/device_clock.hh
@@ -0,0 +1,49 @@
+#ifndef CLOCK_HH_
+#define CLOCK_HH_
+
+#include <chrono>
+
+// This header provides a DeviceClock that abstracts away the Vulkan details of
+// comparing CPU and GPU times.
+
+namespace low_latency {
+
+class DeviceContext;
+
+class DeviceClock final {
+ public:
+ // FIXME this is bad, see now().
+ using time_point_t = std::chrono::time_point<std::chrono::steady_clock,
+ std::chrono::nanoseconds>;
+ const DeviceContext& device;
+
+ public:
+ std::uint64_t host_ns;
+ std::uint64_t error_bound;
+ std::uint64_t device_ticks;
+
+ public:
+ DeviceClock(const DeviceContext& device);
+ DeviceClock(const DeviceClock&) = delete;
+ DeviceClock(DeviceClock&&) = delete;
+ DeviceClock operator=(const DeviceClock&) = delete;
+ DeviceClock operator=(DeviceClock&&) = delete;
+ ~DeviceClock();
+
+ public:
+ // WARNING: This *MUST* be used over std::chrono::steady_clock::now if
+ // you're planning on comparing it to a device's clock. If it isn't, the
+    // timestamps might be from different domains and will be completely
+ // nonsensical.
+ // FIXME we should be able to fix this with a tiny wrapper class of
+ // time_point_t that enforces typesafety.
+ static time_point_t now();
+
+ public:
+ void calibrate();
+ time_point_t ticks_to_time(const std::uint64_t& ticks) const;
+};
+
+} // namespace low_latency
+
+#endif \ No newline at end of file
diff --git a/src/device_context.cc b/src/device_context.cc
index 58737e2..5438e40 100644
--- a/src/device_context.cc
+++ b/src/device_context.cc
@@ -1,6 +1,5 @@
#include "device_context.hh"
-#include <time.h>
#include <utility>
#include <vulkan/vulkan_core.h>
@@ -9,15 +8,15 @@ namespace low_latency {
DeviceContext::DeviceContext(InstanceContext& parent_instance,
PhysicalDeviceContext& parent_physical_device,
const VkDevice& device,
- const bool was_antilag_requested,
+ const bool was_capability_requested,
VkuDeviceDispatchTable&& vtable)
: instance(parent_instance), physical_device(parent_physical_device),
- was_antilag_requested(was_antilag_requested), device(device),
+ was_capability_requested(was_capability_requested), device(device),
vtable(std::move(vtable)) {
// Only create our clock if we can support creating it.
if (this->physical_device.supports_required_extensions) {
- this->clock = std::make_unique<Clock>(*this);
+ this->clock = std::make_unique<DeviceClock>(*this);
}
}
@@ -29,72 +28,10 @@ DeviceContext::~DeviceContext() {
}
}
-DeviceContext::Clock::Clock(const DeviceContext& context) : device(context) {
- this->calibrate();
-}
-
-DeviceContext::Clock::~Clock() {}
-
-DeviceContext::Clock::time_point_t DeviceContext::Clock::now() {
- auto ts = timespec{};
- if (clock_gettime(CLOCK_MONOTONIC, &ts)) {
- throw errno;
- }
-
- return time_point_t{std::chrono::seconds{ts.tv_sec} +
- std::chrono::nanoseconds{ts.tv_nsec}};
-}
-
-void DeviceContext::Clock::calibrate() {
- const auto infos = std::vector<VkCalibratedTimestampInfoKHR>{
- {VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr,
- VK_TIME_DOMAIN_DEVICE_EXT},
- {VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr,
- VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT}};
-
- struct CalibratedResult {
- std::uint64_t device;
- std::uint64_t host;
- };
- auto calibrated_result = CalibratedResult{};
-
- THROW_NON_VKSUCCESS(device.vtable.GetCalibratedTimestampsKHR(
- device.device, 2, std::data(infos), &calibrated_result.device,
- &this->error_bound));
-
- this->device_ticks = calibrated_result.device;
- this->host_ns = calibrated_result.host;
-}
-
-DeviceContext::Clock::time_point_t
-DeviceContext::Clock::ticks_to_time(const std::uint64_t& ticks) const {
- const auto& pd = device.physical_device.properties;
- const auto ns_tick = static_cast<double>(pd->limits.timestampPeriod);
-
- const auto diff = [&]() -> auto {
- auto a = this->device_ticks;
- auto b = ticks;
- const auto is_negative = a > b;
- if (is_negative) {
- std::swap(a, b);
- }
- const auto abs_diff = b - a;
- assert(abs_diff <= std::numeric_limits<std::int64_t>::max());
- const auto signed_abs_diff = static_cast<std::int64_t>(abs_diff);
- return is_negative ? -signed_abs_diff : signed_abs_diff;
- }();
-
- const auto diff_nsec =
- static_cast<std::int64_t>(static_cast<double>(diff) * ns_tick + 0.5);
- const auto delta = std::chrono::nanoseconds(
- this->host_ns + static_cast<std::uint64_t>(diff_nsec));
- return time_point_t{delta};
-}
-
+/*
void DeviceContext::sleep_in_input() {
// TODO
- /*
// Present hasn't happened yet, we don't know what queue to attack.
if (!this->present_queue) {
return;
@@ -122,30 +59,36 @@ void DeviceContext::sleep_in_input() {
// would get huge frame drops, loss of throughput, and the GPU would even
// clock down. So naturally I am concerned about this approach, but it seems
// to perform well so far in my own testing and is just beautifully elegant.
- */
}
+*/
-void DeviceContext::update_swapchain_infos(
+void DeviceContext::update_params(
const std::optional<VkSwapchainKHR> target,
const std::chrono::milliseconds& present_delay,
const bool was_low_latency_requested) {
- const auto write = SwapchainInfo{
- .present_delay = present_delay,
- .was_low_latency_requested = was_low_latency_requested,
- };
-
- if (target.has_value()) {
- const auto iter = this->swapchain_infos.find(*target);
- assert(iter != std::end(this->swapchain_infos)); // Must exist (spec).
- iter->second = write;
+ // If we don't have a target (AMD's anti_lag doesn't differentiate between
+ // swapchains), just write it to everything.
+ if (!target.has_value()) {
+ for (auto& iter : this->swapchain_monitors) {
+ iter.second.update_params(was_low_latency_requested, present_delay);
+ }
return;
}
- // If we don't have a target (AMD's anti_lag), just write it to everything.
- for (auto& iter : this->swapchain_infos) {
- iter.second = write;
- }
+ const auto iter = this->swapchain_monitors.find(*target);
+ assert(iter != std::end(this->swapchain_monitors));
+ iter->second.update_params(was_low_latency_requested, present_delay);
+}
+
+void DeviceContext::notify_present(
+ const VkSwapchainKHR& swapchain,
+ const QueueContext::submissions_t& submissions) {
+
+ const auto iter = this->swapchain_monitors.find(swapchain);
+ assert(iter != std::end(this->swapchain_monitors));
+
+ iter->second.notify_present(submissions);
}
} // namespace low_latency \ No newline at end of file
diff --git a/src/device_context.hh b/src/device_context.hh
index 6b5f000..172801c 100644
--- a/src/device_context.hh
+++ b/src/device_context.hh
@@ -11,59 +11,30 @@
#include <vulkan/vulkan_core.h>
#include "context.hh"
+#include "device_clock.hh"
#include "instance_context.hh"
#include "physical_device_context.hh"
+#include "queue_context.hh"
+#include "swapchain_monitor.hh"
namespace low_latency {
-class QueueContext;
-
-struct DeviceContext final : public Context {
+class DeviceContext final : public Context {
public:
InstanceContext& instance;
PhysicalDeviceContext& physical_device;
- const bool was_antilag_requested;
+ // Whether or not we were asked to do NV_VK_LowLatency2 or VK_AMD_anti_lag.
+ const bool was_capability_requested;
const VkDevice device;
const VkuDeviceDispatchTable vtable;
- // Tiny struct to represent any swapchain's low latency state.
- struct SwapchainInfo {
- std::chrono::milliseconds present_delay = std::chrono::milliseconds{0};
- bool was_low_latency_requested = false;
- };
- std::unordered_map<VkSwapchainKHR, SwapchainInfo> swapchain_infos{};
-
std::unordered_map<VkQueue, std::shared_ptr<QueueContext>> queues;
- struct Clock {
- public:
- using time_point_t = std::chrono::time_point<std::chrono::steady_clock,
- std::chrono::nanoseconds>;
- const DeviceContext& device;
-
- public:
- std::uint64_t host_ns;
- std::uint64_t error_bound;
- std::uint64_t device_ticks;
-
- public:
- Clock(const DeviceContext& device);
- ~Clock();
+ std::unique_ptr<DeviceClock> clock;
- public:
- // WARNING: This *MUST* be used over std::chrono::steady_clock::now if
- // you're planning on comparing it to a device's clock. If it isn't, the
- // timestamps might from different domains and will be completely
- // nonsensical.
- static time_point_t now();
-
- public:
- void calibrate();
- time_point_t ticks_to_time(const std::uint64_t& ticks) const;
- };
- std::unique_ptr<Clock> clock;
+ std::unordered_map<VkSwapchainKHR, SwapchainMonitor> swapchain_monitors;
public:
DeviceContext(InstanceContext& parent_instance,
@@ -73,13 +44,14 @@ struct DeviceContext final : public Context {
virtual ~DeviceContext();
public:
- void sleep_in_input();
-
// Updates the settings associated with that swapchain. If none is provided
// all swapchains are set to this value.
- void update_swapchain_infos(const std::optional<VkSwapchainKHR> target,
- const std::chrono::milliseconds& present_delay,
- const bool was_low_latency_requested);
+ void update_params(const std::optional<VkSwapchainKHR> target,
+ const std::chrono::milliseconds& present_delay,
+ const bool was_low_latency_requested);
+
+ void notify_present(const VkSwapchainKHR& swapchain,
+ const QueueContext::submissions_t& submissions);
};
}; // namespace low_latency
diff --git a/src/helper.cc b/src/helper.cc
new file mode 100644
index 0000000..bb17c59
--- /dev/null
+++ b/src/helper.cc
@@ -0,0 +1,5 @@
+#include "helper.hh"
+
+namespace low_latency {
+
+} \ No newline at end of file
diff --git a/src/helper.hh b/src/helper.hh
new file mode 100644
index 0000000..468f146
--- /dev/null
+++ b/src/helper.hh
@@ -0,0 +1,59 @@
+#ifndef HELPER_HH_
+#define HELPER_HH_
+
+#include <vulkan/vk_layer.h>
+#include <vulkan/vulkan.h>
+
+#include <cstdint>
+
+namespace low_latency {
+
+// Small templates which allow us to SFINAE find pNext structs.
+template <typename T>
+static T* find_next(void* const head, const VkStructureType& stype) {
+ for (auto i = reinterpret_cast<VkBaseOutStructure*>(head)->pNext; i;
+ i = i->pNext) {
+
+ if (i->sType == stype) {
+ return reinterpret_cast<T*>(i);
+ }
+ }
+ return nullptr;
+}
+
+template <typename T>
+static const T* find_next(const void* const head,
+ const VkStructureType& stype) {
+
+ for (auto i = reinterpret_cast<const VkBaseInStructure*>(head)->pNext; i;
+ i = i->pNext) {
+
+ if (i->sType == stype) {
+ return reinterpret_cast<const T*>(i);
+ }
+ }
+ return nullptr;
+}
+
+template <typename T>
+static const T* find_link(const void* const head,
+ const VkStructureType& stype) {
+ for (auto info = find_next<T>(head, stype); info;
+ info = find_next<T>(info, stype)) {
+
+ if (info->function == VK_LAYER_LINK_INFO) {
+ return reinterpret_cast<const T*>(info);
+ }
+ }
+ return nullptr;
+}
+
+template <typename T> std::uint64_t extract_present_id(const T& submit) {
+ const auto lspi = find_next<VkLatencySubmissionPresentIdNV>(
+ &submit, VK_STRUCTURE_TYPE_LATENCY_SUBMISSION_PRESENT_ID_NV);
+ return lspi ? lspi->presentID : 0;
+}
+
+} // namespace low_latency
+
+#endif \ No newline at end of file
diff --git a/src/layer.cc b/src/layer.cc
index 5460fca..7a7ffc8 100644
--- a/src/layer.cc
+++ b/src/layer.cc
@@ -14,7 +14,9 @@
#include <vulkan/vulkan.hpp>
#include <vulkan/vulkan_core.h>
+#include "device_clock.hh"
#include "device_context.hh"
+#include "helper.hh"
#include "instance_context.hh"
#include "layer_context.hh"
#include "queue_context.hh"
@@ -28,46 +30,6 @@ LayerContext layer_context;
} // namespace
-// Small templates which allow us to SFINAE find pNext structs.
-template <typename T>
-static T* find_next(void* const head, const VkStructureType& stype) {
- for (auto i = reinterpret_cast<VkBaseOutStructure*>(head)->pNext; i;
- i = i->pNext) {
-
- if (i->sType == stype) {
- return reinterpret_cast<T*>(i);
- }
- }
- return nullptr;
-}
-
-template <typename T>
-static const T* find_next(const void* const head,
- const VkStructureType& stype) {
-
- for (auto i = reinterpret_cast<const VkBaseInStructure*>(head)->pNext; i;
- i = i->pNext) {
-
- if (i->sType == stype) {
- return reinterpret_cast<const T*>(i);
- }
- }
- return nullptr;
-}
-
-template <typename T>
-static const T* find_link(const void* const head,
- const VkStructureType& stype) {
- for (auto info = find_next<T>(head, stype); info;
- info = find_next<T>(info, stype)) {
-
- if (info->function == VK_LAYER_LINK_INFO) {
- return reinterpret_cast<const T*>(info);
- }
- }
- return nullptr;
-}
-
static VKAPI_ATTR VkResult VKAPI_CALL
CreateInstance(const VkInstanceCreateInfo* pCreateInfo,
const VkAllocationCallbacks* pAllocator, VkInstance* pInstance) {
@@ -209,12 +171,12 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
// is not the case with AL2, because the vulkan application has to
// explicitly ask for the extension when it creates the device.
- const auto was_antilag_requested =
+ const auto was_capability_requested =
requested.contains(VK_AMD_ANTI_LAG_EXTENSION_NAME) ||
requested.contains(VK_NV_LOW_LATENCY_2_EXTENSION_NAME);
const auto context = layer_context.get_context(physical_device);
- if (!context->supports_required_extensions && was_antilag_requested) {
+ if (!context->supports_required_extensions && was_capability_requested) {
return VK_ERROR_INITIALIZATION_FAILED;
}
@@ -305,7 +267,7 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
assert(!layer_context.contexts.contains(key));
layer_context.contexts.try_emplace(
key, std::make_shared<DeviceContext>(context->instance, *context,
- *pDevice, was_antilag_requested,
+ *pDevice, was_capability_requested,
std::move(vtable)));
return VK_SUCCESS;
@@ -443,7 +405,7 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count,
// more explicit + insurance if that changes.
auto handles = std::vector<std::shared_ptr<TimestampPool::Handle>>{};
- const auto now = DeviceContext::Clock::now();
+ const auto now = DeviceClock::now();
std::ranges::transform(
std::span{submit_infos, submit_count}, std::back_inserter(next_submits),
@@ -451,7 +413,9 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count,
const auto head_handle = context->timestamp_pool->acquire();
const auto tail_handle = context->timestamp_pool->acquire();
head_handle->setup_command_buffers(*tail_handle, *context);
- context->notify_submit(submit, head_handle, tail_handle, now);
+
+ context->notify_submit(extract_present_id(submit), head_handle,
+ tail_handle, now);
handles.emplace_back(head_handle);
handles.emplace_back(tail_handle);
@@ -494,7 +458,7 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count,
auto next_cbs = std::vector<std::unique_ptr<cbs_t>>{};
auto handles = std::vector<std::shared_ptr<TimestampPool::Handle>>{};
- const auto now = DeviceContext::Clock::now();
+ const auto now = DeviceClock::now();
std::ranges::transform(
std::span{submit_infos, submit_count}, std::back_inserter(next_submits),
@@ -502,7 +466,9 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count,
const auto head_handle = context->timestamp_pool->acquire();
const auto tail_handle = context->timestamp_pool->acquire();
head_handle->setup_command_buffers(*tail_handle, *context);
- context->notify_submit(submit, head_handle, tail_handle, now);
+
+ context->notify_submit(extract_present_id(submit), head_handle,
+ tail_handle, now);
handles.emplace_back(head_handle);
handles.emplace_back(tail_handle);
@@ -553,7 +519,14 @@ vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* present_info) {
return res;
}
- context->notify_present(*present_info);
+ const auto pid = find_next<VkPresentIdKHR>(
+ present_info, VK_STRUCTURE_TYPE_PRESENT_ID_KHR);
+
+ for (auto i = std::uint32_t{0}; i < present_info->swapchainCount; ++i) {
+ const auto& swapchain = present_info->pSwapchains[i];
+ const auto present_id = pid ? pid->pPresentIds[i] : 0;
+ context->notify_present(swapchain, present_id);
+ }
return VK_SUCCESS;
}
@@ -644,6 +617,17 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2(
vtable.GetPhysicalDeviceFeatures2(physical_device, pFeatures);
+ // We're going to use this feature for both VK_AMD_anti_lag and
+ // VK_NV_low_latency2. It simplifies things a bit if we share a code path
+ // for now. TODO remove it in the future for VK_AMD_anti_lag.
+ if (const auto pidf = find_next<VkPhysicalDevicePresentIdFeaturesKHR>(
+ pFeatures,
+ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENT_ID_FEATURES_KHR);
+ pidf) {
+
+ pidf->presentId = true;
+ }
+
// Don't provide AntiLag if we're trying to spoof nvidia.
// Nvidia uses VkSurfaceCapabilities2KHR to determine if a surface
// is capable of reflex instead of AMD's physical device switch found here.
@@ -651,11 +635,11 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2(
return;
}
- const auto feature = find_next<VkPhysicalDeviceAntiLagFeaturesAMD>(
- pFeatures, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ANTI_LAG_FEATURES_AMD);
+ if (const auto alf = find_next<VkPhysicalDeviceAntiLagFeaturesAMD>(
+ pFeatures, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ANTI_LAG_FEATURES_AMD);
+ alf) {
- if (feature) {
- feature->antiLag = context->supports_required_extensions;
+ alf->antiLag = context->supports_required_extensions;
}
}
@@ -707,12 +691,11 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceSurfaceCapabilities2KHR(
const auto lsc = find_next<VkLatencySurfaceCapabilitiesNV>(
pSurfaceCapabilities,
VK_STRUCTURE_TYPE_LATENCY_SURFACE_CAPABILITIES_NV);
-
if (!lsc) {
return;
}
- // I kind of eyeballed these!
+ // I eyeballed these - there might be more that we can support.
const auto supported_modes = std::vector<VkPresentModeKHR>{
VK_PRESENT_MODE_IMMEDIATE_KHR,
VK_PRESENT_MODE_MAILBOX_KHR,
@@ -723,7 +706,7 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceSurfaceCapabilities2KHR(
// They're asking how many we want to return.
if (!lsc->pPresentModes) {
- lsc->presentModeCount = static_cast<std::uint32_t>(num_supported_modes);
+ lsc->presentModeCount = num_supported_modes;
return;
}
@@ -750,19 +733,17 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateSwapchainKHR(
return result;
}
- auto addition = DeviceContext::SwapchainInfo{
- .present_delay = std::chrono::milliseconds{0},
- .was_low_latency_requested = false,
- };
-
+ // VK_NV_low_latency2 allows a swapchain to be created with the low latency
+ // mode already on via VkSwapchainLatencyCreateInfoNV.
+ auto was_low_latency_requested = false;
if (const auto slci = find_next<VkSwapchainLatencyCreateInfoNV>(
pCreateInfo, VK_STRUCTURE_TYPE_SWAPCHAIN_LATENCY_CREATE_INFO_NV);
slci) {
-
- addition.was_low_latency_requested = slci->latencyModeEnable;
- }
- assert(context->swapchain_infos.try_emplace(*pSwapchain, addition).second);
+ was_low_latency_requested = slci->latencyModeEnable;
+ }
+ context->swapchain_monitors.try_emplace(*pSwapchain, *context,
+ was_low_latency_requested);
return VK_SUCCESS;
}
@@ -772,7 +753,7 @@ DestroySwapchainKHR(VkDevice device, VkSwapchainKHR swapchain,
const VkAllocationCallbacks* pAllocator) {
const auto context = layer_context.get_context(device);
- assert(context->swapchain_infos.erase(swapchain));
+ assert(context->swapchain_monitors.erase(swapchain));
context->vtable.DestroySwapchainKHR(device, swapchain, pAllocator);
}
@@ -788,20 +769,20 @@ AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD* pData) {
// NVIDIA's method and then have a working AL2 implementation follow using
// that existing code path.
- const auto present_delay = [&]() { // lambda abuse?
+ const auto present_delay = [&]() {
using namespace std::chrono;
return duration_cast<milliseconds>(1s / pData->maxFPS);
}();
- context->update_swapchain_infos(std::nullopt, present_delay,
- (pData->mode == VK_ANTI_LAG_MODE_ON_AMD));
+ context->update_params(std::nullopt, present_delay,
+ (pData->mode == VK_ANTI_LAG_MODE_ON_AMD));
if (!pData->pPresentationInfo) {
return;
}
if (pData->pPresentationInfo->stage == VK_ANTI_LAG_STAGE_INPUT_AMD) {
- context->sleep_in_input();
+ // TODO use nvidia's path
}
}
@@ -811,16 +792,25 @@ VkResult LatencySleepNV(VkDevice device, VkSwapchainKHR swapchain,
const auto context = layer_context.get_context(device);
assert(pSleepInfo);
- // TODO sleep here
+ // We're associating an application-provided timeline semaphore + value with
+ // a swapchain that says 'signal me when we should move past input'.
+ auto& swapchain_monitor = [&]() -> auto& {
+ const auto iter = context->swapchain_monitors.find(swapchain);
+ assert(iter != std::end(context->swapchain_monitors));
+ return iter->second;
+ }();
+
+ // Tell our swapchain monitor that if they want us to proceed they should
+ // signal this semaphore.
+ swapchain_monitor.notify_semaphore(pSleepInfo->signalSemaphore,
+ pSleepInfo->value);
return VK_SUCCESS;
}
void QueueNotifyOutOfBandNV(VkQueue queue,
const VkOutOfBandQueueTypeInfoNV* pQueueTypeInfo) {
- // This is really thoughtful from NVIDIA. Having the application explicitly
- // state which queues should be ignored for latency evaluation is far
- // superior to AMD's guessing game.
+
// Kind of interesting how you can't turn it back on once it's turned off.
// Also I really have no idea why pQueueTypeInfo's VkOutOfBandQueueTypeNV
// enum even exists (I guess we will find out later when nothing works).
@@ -834,14 +824,13 @@ VkResult SetLatencySleepModeNV(VkDevice device, VkSwapchainKHR swapchain,
const auto context = layer_context.get_context(device);
if (pSleepModeInfo) {
- context->update_swapchain_infos(
+ context->update_params(
swapchain,
std::chrono::milliseconds{pSleepModeInfo->minimumIntervalUs},
pSleepModeInfo->lowLatencyMode);
} else {
// If pSleepModeInfo is nullptr, it means no delay and no low latency.
- context->update_swapchain_infos(swapchain, std::chrono::milliseconds{0},
- false);
+ context->update_params(swapchain, std::chrono::milliseconds{0}, false);
}
return VK_SUCCESS;
}
diff --git a/src/physical_device_context.hh b/src/physical_device_context.hh
index 9624faa..f7ad289 100644
--- a/src/physical_device_context.hh
+++ b/src/physical_device_context.hh
@@ -4,6 +4,7 @@
#include "instance_context.hh"
#include <vulkan/vulkan.hpp>
+#include <vulkan/vulkan_core.h>
#include "context.hh"
@@ -17,7 +18,8 @@ class PhysicalDeviceContext final : public Context {
static constexpr auto required_extensions = {
VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME,
VK_KHR_CALIBRATED_TIMESTAMPS_EXTENSION_NAME,
- VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME};
+ VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME,
+ VK_KHR_PRESENT_ID_EXTENSION_NAME};
public:
InstanceContext& instance;
@@ -29,8 +31,8 @@ class PhysicalDeviceContext final : public Context {
using queue_properties_t = std::vector<VkQueueFamilyProperties2>;
std::unique_ptr<const queue_properties_t> queue_properties;
- // Will be set to true in the constructor if the physical device supports
- // everything we need to track gpu timing data.
+ // Will be true if the physical device supports everything in
+ // this->required_extensions.
bool supports_required_extensions = false;
public:
diff --git a/src/queue_context.cc b/src/queue_context.cc
index d12f03d..30e73c1 100644
--- a/src/queue_context.cc
+++ b/src/queue_context.cc
@@ -3,11 +3,6 @@
#include "layer_context.hh"
#include "timestamp_pool.hh"
-#include <algorithm>
-#include <chrono>
-#include <fstream>
-#include <iostream>
-#include <ranges>
#include <span>
#include <vulkan/vulkan_core.h>
@@ -49,333 +44,52 @@ QueueContext::QueueContext(DeviceContext& device_context, const VkQueue& queue,
}
QueueContext::~QueueContext() {
- this->in_flight_frames.clear();
- this->submissions.clear();
+ this->unpresented_submissions.clear();
this->timestamp_pool.reset();
}
void QueueContext::notify_submit(
- const VkSubmitInfo& info,
+ const present_id_t& present_id,
const std::shared_ptr<TimestampPool::Handle> head_handle,
const std::shared_ptr<TimestampPool::Handle> tail_handle,
- const DeviceContext::Clock::time_point_t& now) {
-
- auto signals = std::unordered_set<VkSemaphore>{};
- auto waits = std::unordered_set<VkSemaphore>{};
- std::ranges::copy(std::span{info.pWaitSemaphores, info.waitSemaphoreCount},
- std::inserter(waits, std::end(waits)));
- std::ranges::copy(
- std::span{info.pSignalSemaphores, info.signalSemaphoreCount},
- std::inserter(signals, std::end(signals)));
-
- this->submissions.emplace_back(std::make_unique<Submission>(
- std::move(signals), std::move(waits), head_handle, tail_handle, now));
-
- if (std::size(this->submissions) > this->MAX_TRACKED_SUBMISSIONS) {
- this->submissions.pop_front();
- }
-}
-
-// Identical to notify_submit, but we use VkSubmitInfo2.
-void QueueContext::notify_submit(
- const VkSubmitInfo2& info,
- const std::shared_ptr<TimestampPool::Handle> head_handle,
- const std::shared_ptr<TimestampPool::Handle> tail_handle,
- const DeviceContext::Clock::time_point_t& now) {
-
- auto signals = std::unordered_set<VkSemaphore>{};
- auto waits = std::unordered_set<VkSemaphore>{};
-
- std::ranges::transform(
- std::span{info.pWaitSemaphoreInfos, info.waitSemaphoreInfoCount},
- std::inserter(waits, std::end(waits)),
- [](const auto& info) -> auto { return info.semaphore; });
-
- std::ranges::transform(
- std::span{info.pSignalSemaphoreInfos, info.signalSemaphoreInfoCount},
- std::inserter(signals, std::end(signals)),
- [](const auto& info) -> auto { return info.semaphore; });
-
- this->submissions.emplace_back(std::make_unique<Submission>(
- std::move(signals), std::move(waits), head_handle, tail_handle, now));
-
- if (std::size(this->submissions) > this->MAX_TRACKED_SUBMISSIONS) {
- this->submissions.pop_front();
- }
-}
-
-void QueueContext::drain_submissions_to_frame() {
-
- // We are going to assume that all queue submissions before and on the same
- // queue contribute to the frame.
-
- // This used to be more complicated where we found the first submission that
- // was signalled by acquire, then we walked forwards until we found the
- // submission before it that marked the end of frame (which was the last
- // submission in the previous frame that called notify submit). This seemed
- // completely redundant, in all cases it was exactly what we have here. But
- // I could be wrong.
-
- const auto start_iter = std::begin(this->submissions);
- // no op submit?
- if (start_iter == std::end(this->submissions)) {
- return;
+ const DeviceClock::time_point_t& now) {
+
+ // Push this submission onto our unpresented_submissions at our present_id
+ // mapping (might be empty, but handled with operator[]).
+ auto& submissions = this->unpresented_submissions[present_id];
+ if (submissions == nullptr) {
+ submissions =
+ std::make_shared<std::deque<std::unique_ptr<Submission>>>();
}
- // The last submission is either in flight, already processed, or we
- // just happen to be the first frame and we can just set it to our start
- // with little consequence.
- const auto prev_frame_last_submit = [&]() -> auto {
- if (const auto iter = std::rbegin(this->in_flight_frames);
- iter != std::rend(this->in_flight_frames)) {
-
- assert(!iter->submissions.empty());
- return iter->submissions.back();
- }
-
- if (const auto iter = std::rbegin(this->timings);
- iter != std::rend(this->timings)) {
+ submissions->push_back(
+ std::make_unique<Submission>(Submission{.head_handle = head_handle,
+ .tail_handle = tail_handle,
+ .cpu_present_time = now}));
- const auto& submissions = (*iter)->frame.submissions;
- assert(!submissions.empty());
-
- return submissions.back();
- }
-
- return *start_iter;
- }();
-
- this->in_flight_frames.emplace_back(
- Frame{.submissions = std::move(this->submissions),
- .cpu_post_present_time = DeviceContext::Clock::now()});
- assert(std::size(this->in_flight_frames.back().submissions));
- // *valid but unspecified state after move, so clear!*
- this->submissions.clear();
-}
-
-void QueueContext::notify_present(const VkPresentInfoKHR& info) {
- this->drain_submissions_to_frame();
- this->drain_frames_to_timings();
-
- // We should only sleep in present if two conditions are met:
- // 1. Our antilag_mode isn't set to on, because otherwise the sleep will
- // be done in input and with far better results.
- // 2. The 'is_antilag_1_enabled' flag, which exists at the layer's
- // context, is set.
- //
- /*
- * WIP REFLEX
- if (this->device_context.antilag_mode != VK_ANTI_LAG_MODE_ON_AMD &&
- this->device_context.instance.layer.is_antilag_1_enabled) {
-
- this->sleep_in_present();
+ // This is probably hit if our queue never actually presents to anything,
+ // because the only time we manually evict our unpresented_submissions is
+ // when we present to something.
+ if (std::size(*submissions) > this->MAX_TRACKED_SUBMISSIONS) {
+ submissions->pop_front();
}
- */
}
-const auto debug_log_time2 = [](auto& stream, const auto& diff) {
- using namespace std::chrono;
- const auto ms = duration_cast<milliseconds>(diff);
- const auto us = duration_cast<microseconds>(diff - ms);
- const auto ns = duration_cast<nanoseconds>(diff - ms - us);
- stream << ms << " " << us << " " << ns << " ago\n";
-};
+void QueueContext::notify_present(const VkSwapchainKHR& swapchain,
+ const present_id_t& present_id) {
-void QueueContext::drain_frames_to_timings() {
- if (!std::size(this->in_flight_frames)) {
- return;
+ // Notify the device that this swapchain was just presented to.
+ // We're avoiding a double hash here - don't use operator[] and erase.
+ auto iter = this->unpresented_submissions.try_emplace(present_id).first;
+ if (iter->second == nullptr) {
+ iter->second =
+ std::make_shared<std::deque<std::unique_ptr<Submission>>>();
}
- // Only need to calibrate this device, we don't support multi device anti
- // lag.
- this->device_context.clock->calibrate();
-
- while (std::size(this->in_flight_frames)) {
- const auto& frame = this->in_flight_frames.front();
-
- assert(std::size(frame.submissions));
-
- const auto& last_submission = frame.submissions.back();
-
- // Not completed (so future frames definitely aren't) - stop early.
- if (!last_submission->end_handle->get_time().has_value()) {
- break;
- }
-
- // We are committed to removing the frame at this stage and
- // promoting it to a 'timing' struct because it's completed.
- // We can guarantee that we can extract timing information from
- // all start/end handles now.
-
- // Using leetcode merge intervals in the wild lol
- struct Interval {
- DeviceContext::Clock::time_point_t start, end;
- };
-
- const auto sorted_intervals = [&]() -> auto {
- auto intervals = std::vector<Interval>{};
- std::ranges::transform(
- frame.submissions, std::back_inserter(intervals),
- [&](const auto& submission) {
- return Interval{
- .start = submission->start_handle->get_time_required(),
- .end = submission->end_handle->get_time_required(),
- };
- });
-
- std::ranges::sort(intervals, [](const auto& a, const auto& b) {
- return a.start < b.start;
- });
- return intervals;
- }();
-
- const auto merged = [&]() -> auto {
- auto merged = std::vector<Interval>{};
- auto last = sorted_intervals[0];
-
- for (const auto& [s, e] : sorted_intervals | std::views::drop(1)) {
- if (s <= last.end) {
- last.end = std::max(last.end, e);
- } else {
- merged.push_back(last);
- last = {s, e};
- }
- }
- merged.push_back(last);
- return merged;
- }();
-
- // It's important to note that gputime starts from a point which isn't
- // equal to the below 'start' var. It looks something like this, where a
- // '-' represents CPU time only and '=' represents CPU + GPU.
- //
- // |---------------------|=========|--------|====|-----------------|
- // ^ last_present ^ merged.front().start present ^
- // merged.back().end ^
- //
- // I would imagine there would be more GPU than cpu to reach the anti
- // lag codepath than is depicted here. We can track the total time
- // between vkPresent calls as future_submit - last_submit. The total
- // time the GPU spent engaged is the sum of all intervals. So we can
- // get a meaningful 'not_gputime' as total - gpu_time.
-
- const auto gputime = std::ranges::fold_left(
- merged, DeviceContext::Clock::time_point_t::duration{},
- [](auto gputime, const auto& interval) {
- const auto& [start, end] = interval;
- return gputime + (end - start);
- });
-
- // Our cpu_start value here refers to the time when the CPU was allowed
- // to move past the present call and, in theory, begin cpu work on the
- // next frame.
- const auto cpu_start = [&]() -> auto {
- if (const auto it = std::rbegin(this->timings);
- it != std::rend(this->timings)) {
-
- return (*it)->frame.cpu_post_present_time;
- }
- // This will happen once, only for the first frame. We don't
- // have a way of knowing when the CPU first started work here.
- // Just return our first submit's start for this edge case.
- return frame.submissions.front()->start_handle->get_time_required();
- }();
-
- const auto cputime =
- frame.submissions.front()->enqueued_time - cpu_start;
-
- this->timings.emplace_back(std::make_unique<Timing>(Timing{
- .gputime = gputime,
- .cputime = cputime,
- .frame = frame,
- }));
-
- this->in_flight_frames.pop_front();
- }
-
- if (const auto T = std::size(this->timings);
- T > this->MAX_TRACKED_TIMINGS) {
-
- const auto erase_to_iter =
- std::next(std::begin(this->timings),
- static_cast<long>(T - MAX_TRACKED_TIMINGS));
- this->timings.erase(std::begin(this->timings), erase_to_iter);
- }
-}
-
-void QueueContext::sleep_in_present() {
- // After calling this, any remaining frames are truly in flight.
- this->drain_frames_to_timings();
- if (!std::size(this->in_flight_frames)) {
- return;
- }
-
- // This is getting the most recent frame and waiting until its start has
- // begun. This means that, in the case of >1 frame in flight, it's draining
- // all of them before we're allowed to move forward.
- const auto first_gpu_work = [&]() -> auto {
- const auto& most_recent_frame = this->in_flight_frames.back();
- const auto& first_submission = most_recent_frame.submissions.front();
- return first_submission->start_handle->get_time_spinlock();
- }();
-
- // Drain frames again because as stated above, we might have multiple frames
- // now completed after our wait spinlock.
- this->drain_frames_to_timings();
-
- // Check the size again because the frame we want to target may have already
- // completed when we called process_frames().
- if (!std::size(this->in_flight_frames)) {
- return;
- }
- assert(std::size(this->in_flight_frames) == 1);
-
- // Not enough data yet to apply any delays.
- if (std::size(this->timings) < this->MAX_TRACKED_TIMINGS) {
- return;
- }
-
- const auto calc_median = [&, this](const auto& getter) {
- auto vect = std::vector<Timing*>{};
- std::ranges::transform(this->timings, std::back_inserter(vect),
- [](const auto& timing) { return timing.get(); });
- std::ranges::sort(vect, [&](const auto& a, const auto& b) {
- return getter(a) < getter(b);
- });
- return getter(vect[std::size(vect) / 2]);
- };
-
- const auto expected_gputime =
- calc_median([](const auto& timing) { return timing->gputime; });
- const auto expected_cputime =
- calc_median([](const auto& timing) { return timing->cputime; });
-
- // Should look like this:
- // total_length = expected_gputime
- // |------------------------x------------------------------|
- // ^ first_gpu_work now last_gpu_work ^
-
- const auto now = DeviceContext::Clock::now();
- const auto dist = now - first_gpu_work;
- const auto expected_dist_to_last = expected_gputime - dist;
-
- const auto wait_time = expected_dist_to_last - expected_cputime;
-
- auto& frame = this->in_flight_frames.back();
- const auto& last_gpu_work = frame.submissions.back()->end_handle;
- last_gpu_work->get_time_spinlock(now + wait_time);
-
- frame.cpu_post_present_time = std::chrono::steady_clock::now();
+ this->device_context.notify_present(swapchain, iter->second);
- std::ofstream f("/tmp/times.txt", std::ios::trunc);
- f << " expected gputime: ";
- debug_log_time2(f, expected_gputime);
- f << " expected cputime: ";
- debug_log_time2(f, expected_cputime);
- f << " requestd sleep: ";
- debug_log_time2(f, wait_time);
- f << " observed sleep: ";
- debug_log_time2(f, frame.cpu_post_present_time - now);
+ // Important, we nuke the submission because now it's presented.
+ this->unpresented_submissions.erase(iter);
}
bool QueueContext::should_inject_timestamps() const {
@@ -385,9 +99,9 @@ bool QueueContext::should_inject_timestamps() const {
return false;
}
- // Don't bother injecting timestamps during queue submission if both AL1 and
- // AL2 are disabled.
- if (!this->device_context.was_antilag_requested &&
+ // Don't bother injecting timestamps during queue submission if we
+ // aren't planning on doing anything anyway.
+ if (!this->device_context.was_capability_requested &&
!physical_device.instance.layer.is_antilag_1_enabled) {
return false;
diff --git a/src/queue_context.hh b/src/queue_context.hh
index 221626f..48500e1 100644
--- a/src/queue_context.hh
+++ b/src/queue_context.hh
@@ -2,33 +2,23 @@
#define QUEUE_STATE_HH_
#include "context.hh"
-#include "device_context.hh"
+#include "device_clock.hh"
#include "timestamp_pool.hh"
#include <vulkan/utility/vk_dispatch_table.h>
#include <vulkan/vulkan.hpp>
-#include <chrono>
#include <deque>
#include <memory>
-#include <unordered_set>
+#include <unordered_map>
namespace low_latency {
class QueueContext final : public Context {
private:
- // The amount of finished frame timing data we keep before eviction.
- // For now, this value is also the number of data points used in the
- // calculation of gpu timing information.
- static constexpr auto MAX_TRACKED_TIMINGS = 50u;
// The amount of queue submissions we allow tracked per queue before
- // we give up tracking them. For a queue that is presented to,
- // these submissions will be constantly moved to Frame structs so
- // it's not an issue that we only track so many - unless it just
- // happens that an application makes an unexpectedly large
- // amount of vkQueueSubmit's per frame. For queues which don't
- // present, this limit stops them from growing limitlessly in memory
- // as we may not necessarily manually evict them yet.
+ // we give up tracking them. This is necessary for queues which do not
+ // present anything.
static constexpr auto MAX_TRACKED_SUBMISSIONS = 50u;
public:
@@ -59,55 +49,35 @@ class QueueContext final : public Context {
// NVIDIA's extension lets the application explicitly state that this queue
// does not contribute to the frame. AMD's extension has no such mechanism -
- // so this will always be false.
+ // so this will always be false when using VK_AMD_anti_lag.
bool should_ignore_latency = false;
public:
- // Potentially in flight queue submissions that come from this queue.
+ // I want our queue bookkeeping to be fairly simple and do one thing - track
+ // submissions that have yet to be presented to a swapchain. General
+ // idea:
+ //
+ // For each vkQueueSubmit (specifically for each pSubmitInfo in that
+ // hook) grab the VK_EXT_present_id value provided by the application for
+ // that submission. Once we add our timing objects as part of the hook, we
+ // then take those timing objects, bundle them into a Submission struct, and
+ // append it to the (potentially currently nonexistent) mapping of
+ // present_id's to deque<Submission>'s. Now we cleanly track what queue
+ // submissions refer to what present_id.
+ //
+ // When our hook sees a VkQueuePresentKHR, we take the provided present_id
+ // and notify our device that it needs to watch for when this completes.
+ // We give it our submission. Now, it's out of our hands. We remove the
+ // present_id_t mapping when doing so.
struct Submission {
- const std::unordered_set<VkSemaphore> signals;
- const std::unordered_set<VkSemaphore> waits;
-
- const std::shared_ptr<TimestampPool::Handle> start_handle;
- const std::shared_ptr<TimestampPool::Handle> end_handle;
-
- const DeviceContext::Clock::time_point_t enqueued_time;
- };
- using submission_ptr_t = std::shared_ptr<Submission>;
- std::deque<submission_ptr_t> submissions;
-
- // In flight frame submissions grouped together.
- // The first element in the deque refers to the first submission that
- // contributed to that frame. The last element is the last submission before
- // present was called.
- // std::size(submissions) >= 1 btw
- struct Frame {
- std::deque<submission_ptr_t> submissions;
-
- // the point that control flow was returned from VkQueuePresentKHR back
- // to the application.
- DeviceContext::Clock::time_point_t cpu_post_present_time;
- };
- std::deque<Frame> in_flight_frames;
-
- // Completed frames.
- struct Timing {
- DeviceContext::Clock::time_point_t::duration gputime, cputime;
-
- Frame frame;
+ std::shared_ptr<TimestampPool::Handle> head_handle, tail_handle;
+ DeviceClock::time_point_t cpu_present_time;
};
- std::deque<std::unique_ptr<Timing>> timings;
- private:
- // Drains submissions and promotes them into a single frame object.
- void drain_submissions_to_frame();
-
- // Drains in flight frames and promotes them into a Timing object if they
- // have completed.
- void drain_frames_to_timings();
-
- // Antilag 1 equivalent where we sleep after present to reduce queueing.
- void sleep_in_present();
+ using submissions_t =
+ std::shared_ptr<std::deque<std::unique_ptr<Submission>>>;
+ using present_id_t = std::uint64_t;
+ std::unordered_map<present_id_t, submissions_t> unpresented_submissions;
public:
QueueContext(DeviceContext& device_context, const VkQueue& queue,
@@ -115,17 +85,13 @@ class QueueContext final : public Context {
virtual ~QueueContext();
public:
- void notify_submit(const VkSubmitInfo& info,
- const std::shared_ptr<TimestampPool::Handle> head_handle,
- const std::shared_ptr<TimestampPool::Handle> tail_handle,
- const DeviceContext::Clock::time_point_t& now);
-
- void notify_submit(const VkSubmitInfo2& info,
+ void notify_submit(const present_id_t& present_id,
const std::shared_ptr<TimestampPool::Handle> head_handle,
const std::shared_ptr<TimestampPool::Handle> tail_handle,
- const DeviceContext::Clock::time_point_t& now);
+ const DeviceClock::time_point_t& now);
- void notify_present(const VkPresentInfoKHR& info);
+ void notify_present(const VkSwapchainKHR& swapchain,
+ const std::uint64_t& present_id);
public:
bool should_inject_timestamps() const;
diff --git a/src/swapchain_monitor.cc b/src/swapchain_monitor.cc
new file mode 100644
index 0000000..09fa8ba
--- /dev/null
+++ b/src/swapchain_monitor.cc
@@ -0,0 +1,112 @@
+#include "swapchain_monitor.hh"
+#include "device_context.hh"
+
+#include <vulkan/vulkan_core.h>
+
+#include <functional>
+#include <mutex>
+
+namespace low_latency {
+
+SwapchainMonitor::SwapchainMonitor(const DeviceContext& device,
+ const bool was_low_latency_requested)
+ : device(device), was_low_latency_requested(was_low_latency_requested),
+ swapchain_worker(
+ std::bind_front(&SwapchainMonitor::do_swapchain_monitor, this)) {}
+
+SwapchainMonitor::~SwapchainMonitor() {}
+
+void SwapchainMonitor::WakeupSemaphore::signal(
+ const DeviceContext& device) const {
+
+ const auto ssi =
+ VkSemaphoreSignalInfo{.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO,
+ .semaphore = this->timeline_semaphore,
+ .value = this->value};
+ THROW_NON_VKSUCCESS(device.vtable.SignalSemaphore(device.device, &ssi));
+}
+
+void SwapchainMonitor::do_swapchain_monitor(const std::stop_token stoken) {
+ for (;;) {
+ auto lock = std::unique_lock{this->mutex};
+ this->cv.wait(lock, stoken,
+ [&]() { return !this->wakeup_semaphores.empty(); });
+
+ if (stoken.stop_requested()) {
+ // Small chance an application might need outstanding semaphores
+ // to be signalled if it's closing to avoid a hang.
+ break;
+ }
+
+ // Look for the latest submission and make sure it's completed.
+ if (!this->in_flight_submissions.empty()) {
+ const auto submission = this->in_flight_submissions.back();
+ this->in_flight_submissions.clear();
+
+ if (!submission->empty()) {
+ submission->back()->tail_handle->await_time();
+ }
+ }
+
+ // We might want to signal them all? In theory it's the same timeline
+ // semaphore so obviously it's redundant to signal them one by one. In
+ // almost all cases, there should just be one here anyway.
+ const auto wakeup_semaphore = this->wakeup_semaphores.back();
+ wakeup_semaphores.clear();
+
+ wakeup_semaphore.signal(this->device);
+ }
+}
+
+void SwapchainMonitor::update_params(
+ const bool was_low_latency_requested,
+ const std::chrono::milliseconds present_delay) {
+
+ const auto lock = std::scoped_lock{this->mutex};
+
+ this->was_low_latency_requested = was_low_latency_requested;
+ this->present_delay = present_delay;
+}
+
+void SwapchainMonitor::notify_semaphore(const VkSemaphore& timeline_semaphore,
+ const std::uint64_t& value) {
+
+ const auto lock = std::scoped_lock{this->mutex};
+
+ const auto wakeup_semaphore = WakeupSemaphore{
+ .timeline_semaphore = timeline_semaphore, .value = value};
+
+ // Signal immediately if low_latency isn't requested or if we have no
+ // outstanding work.
+ if (!this->was_low_latency_requested ||
+ this->in_flight_submissions.empty()) {
+
+ wakeup_semaphore.signal(this->device);
+ return;
+ }
+
+ this->wakeup_semaphores.emplace_back(timeline_semaphore, value);
+ this->cv.notify_one();
+}
+
+void SwapchainMonitor::notify_present(
+ const QueueContext::submissions_t& submissions) {
+
+ const auto lock = std::scoped_lock{this->mutex};
+
+ // Fast path where this work has already completed.
+ if (!this->wakeup_semaphores.empty() && !submissions->empty()) {
+
+ const auto& finished = submissions->back()->tail_handle->get_time();
+ if (finished.has_value()) {
+ this->wakeup_semaphores.back().signal(this->device);
+ this->wakeup_semaphores.clear();
+ return;
+ }
+ }
+
+ this->in_flight_submissions.emplace_back(submissions);
+ this->cv.notify_one();
+}
+
+} // namespace low_latency \ No newline at end of file
diff --git a/src/swapchain_monitor.hh b/src/swapchain_monitor.hh
new file mode 100644
index 0000000..5678630
--- /dev/null
+++ b/src/swapchain_monitor.hh
@@ -0,0 +1,69 @@
+#ifndef SWAPCHAIN_MONITOR_HH_
+#define SWAPCHAIN_MONITOR_HH_
+
+// The purpose of this file is to provide a SwapchainMonitor class definition.
+
+#include <vulkan/vulkan_core.h>
+
+#include <chrono>
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+#include <thread>
+
+#include "queue_context.hh"
+
+namespace low_latency {
+
+class DeviceContext;
+
+// A swapchain monitor's job is to provide asynchronous wakeups for threads
+// which request low_latency once the previous presentation has completed.
+// It does this by signalling a semaphore a la VK_NV_low_latency2.
+class SwapchainMonitor {
+ private:
+ const DeviceContext& device;
+
+ // Configurable params for this swapchain.
+ std::chrono::milliseconds present_delay = std::chrono::milliseconds{0};
+ bool was_low_latency_requested = false;
+
+ struct WakeupSemaphore {
+ VkSemaphore timeline_semaphore;
+ std::uint64_t value;
+
+ public:
+ void signal(const DeviceContext& device) const;
+ };
+ std::deque<WakeupSemaphore> wakeup_semaphores;
+ std::deque<QueueContext::submissions_t> in_flight_submissions;
+
+ std::mutex mutex;
+ std::condition_variable_any cv;
+ std::jthread swapchain_worker;
+
+ private:
+ void do_swapchain_monitor(const std::stop_token stoken);
+
+ public:
+ SwapchainMonitor(const DeviceContext& device,
+ const bool was_low_latency_requested);
+ SwapchainMonitor(const SwapchainMonitor&);
+ SwapchainMonitor(SwapchainMonitor&&);
+ SwapchainMonitor operator=(const SwapchainMonitor&);
+ SwapchainMonitor operator=(SwapchainMonitor&&);
+ ~SwapchainMonitor();
+
+ public:
+ void update_params(const bool was_low_latency_requested,
+ const std::chrono::milliseconds present_delay);
+
+ void notify_semaphore(const VkSemaphore& timeline_semaphore,
+ const std::uint64_t& value);
+
+ void notify_present(const QueueContext::submissions_t& submissions);
+};
+
+} // namespace low_latency
+
+#endif \ No newline at end of file
diff --git a/src/timestamp_pool.cc b/src/timestamp_pool.cc
index 247d411..a37b2bc 100644
--- a/src/timestamp_pool.cc
+++ b/src/timestamp_pool.cc
@@ -152,19 +152,18 @@ void TimestampPool::Handle::setup_command_buffers(
THROW_NON_VKSUCCESS(vtable.EndCommandBuffer(tail.command_buffer));
}
-std::optional<DeviceContext::Clock::time_point_t>
-TimestampPool::Handle::get_time() {
- const auto& device_ctx = this->timestamp_pool.queue_context.device_context;
- const auto& vtable = device_ctx.vtable;
+struct QueryResult {
+ std::uint64_t value;
+ std::uint64_t available;
+};
+std::optional<DeviceClock::time_point_t> TimestampPool::Handle::get_time() {
+ const auto& context = this->timestamp_pool.queue_context.device_context;
+ const auto& vtable = context.vtable;
- struct QueryResult {
- std::uint64_t value;
- std::uint64_t available;
- };
auto query_result = QueryResult{};
const auto result = vtable.GetQueryPoolResults(
- device_ctx.device, query_pool,
+ context.device, query_pool,
static_cast<std::uint32_t>(this->query_index), 1, sizeof(query_result),
&query_result, sizeof(query_result),
VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT);
@@ -177,30 +176,31 @@ TimestampPool::Handle::get_time() {
return std::nullopt;
}
- return device_ctx.clock->ticks_to_time(query_result.value);
+ return context.clock->ticks_to_time(query_result.value);
}
-std::optional<DeviceContext::Clock::time_point_t>
-TimestampPool::Handle::get_time_spinlock(
- const DeviceContext::Clock::time_point_t& until) {
+DeviceClock::time_point_t TimestampPool::Handle::await_time() {
+ const auto& context = this->timestamp_pool.queue_context.device_context;
+ const auto& vtable = context.vtable;
- auto time = this->get_time();
- for (; !time.has_value(); time = this->get_time()) {
- if (const auto now = DeviceContext::Clock::now(); now >= until) {
- break;
- }
- }
- return time;
-}
+ struct QueryResult {
+ std::uint64_t value;
+ std::uint64_t available;
+ };
+ auto query_result = QueryResult{};
-DeviceContext::Clock::time_point_t TimestampPool::Handle::get_time_spinlock() {
- constexpr auto max = DeviceContext::Clock::time_point_t::max();
- const auto time = this->get_time_spinlock(max);
- assert(time.has_value());
- return *time;
+ THROW_NON_VKSUCCESS(vtable.GetQueryPoolResults(
+ context.device, query_pool,
+ static_cast<std::uint32_t>(this->query_index), 1, sizeof(query_result),
+ &query_result, sizeof(query_result),
+ VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT |
+ VK_QUERY_RESULT_WAIT_BIT));
+ assert(query_result.available);
+
+ return context.clock->ticks_to_time(query_result.value);
}
-DeviceContext::Clock::time_point_t TimestampPool::Handle::get_time_required() {
+DeviceClock::time_point_t TimestampPool::Handle::get_time_required() {
const auto time = this->get_time();
assert(time.has_value());
return *time;
diff --git a/src/timestamp_pool.hh b/src/timestamp_pool.hh
index 67b34de..0d6c52d 100644
--- a/src/timestamp_pool.hh
+++ b/src/timestamp_pool.hh
@@ -18,11 +18,12 @@
#include <unordered_set>
#include <vector>
-#include "device_context.hh"
+#include "device_clock.hh"
namespace low_latency {
class QueueContext;
+class DeviceContext;
class TimestampPool final {
private:
@@ -119,20 +120,15 @@ class TimestampPool final {
void setup_command_buffers(const Handle& tail,
const QueueContext& queue_context) const;
- // Attempts to get_time, but returns an optional if it's not available
- // yet.
- std::optional<DeviceContext::Clock::time_point_t> get_time();
-
- // Calls get_time() repeatedly under a spinlock, or gives up at
- // time_point_t and returns std::nullopt.
- std::optional<DeviceContext::Clock::time_point_t>
- get_time_spinlock(const DeviceContext::Clock::time_point_t& until);
+ public:
+ // Attempts to get the time - optional if it's not available yet.
+ std::optional<DeviceClock::time_point_t> get_time();
- // Calls get_time() repeatedly under a spinlock until it's available.
- DeviceContext::Clock::time_point_t get_time_spinlock();
+ // Waits until the time is available and returns it.
+ DeviceClock::time_point_t await_time();
// Calls get_time with the assumption it's already available.
- DeviceContext::Clock::time_point_t get_time_required();
+ DeviceClock::time_point_t get_time_required();
};
public: