8 files changed, 88 insertions, 50 deletions
diff --git a/src/context.hh b/src/context.hh
index 5972740..91fbf91 100644
--- a/src/context.hh
+++ b/src/context.hh
@@ -1,13 +1,20 @@
 #ifndef CONTEXT_HH_
 #define CONTEXT_HH_
 
-// The purpose of this class is to provide a base class for Context classes.
-
 namespace low_latency {
 
+// A context class doesn't do much by itself. We just use it to provide a
+// virtual destructor so we can store a bunch of shared_ptrs in the same
+// container and rely on RTTI in the layer context. It also deletes the copy and
+// move constructors for derived classes implicitly, and that's pretty much it.
+//
+// We _could_ do something weird and complicated where we define pure virtual
+// hashing and equality functions so we can store them in an unordered_set, but
+// it's just unnecessary complexity and doesn't allow us to perform 'do you exist'
+// lookups without creating an object.
 class Context {
-    
-public:
+
+  public:
     Context();
     Context(const Context& context) = delete;
     Context(Context&& context) = delete;
diff --git a/src/device_context.cc b/src/device_context.cc
index 49b7808..97103de 100644
--- a/src/device_context.cc
+++ b/src/device_context.cc
@@ -134,9 +134,11 @@ void DeviceContext::notify_antilag_update(const VkAntiLagDataAMD& data) {
         return;
     }
 
-    if (this->antilag_mode == VK_ANTI_LAG_MODE_ON_AMD) {
-        this->sleep_in_input();
+    if (this->antilag_mode != VK_ANTI_LAG_MODE_ON_AMD) {
+        return;
     }
+
+    this->sleep_in_input();
 }
 
 void DeviceContext::notify_queue_present(const QueueContext& queue) {
diff --git a/src/instance_context.cc b/src/instance_context.cc
index d12766f..5a4d48a 100644
--- a/src/instance_context.cc
+++ b/src/instance_context.cc
@@ -5,9 +5,10 @@
 
 namespace low_latency {
 
-InstanceContext::InstanceContext(const VkInstance& instance,
+InstanceContext::InstanceContext(const LayerContext& parent_context,
+                                 const VkInstance& instance,
                                  VkuInstanceDispatchTable&& vtable)
-    : instance(instance), vtable(std::move(vtable)) {}
+    : layer(parent_context), instance(instance), vtable(std::move(vtable)) {}
 
 InstanceContext::~InstanceContext() {
     // Similar to devices, we should own the only shared ptr at this point so
diff --git a/src/instance_context.hh b/src/instance_context.hh
index 3b71a82..001cde8 100644
--- a/src/instance_context.hh
+++ b/src/instance_context.hh
@@ -10,17 +10,22 @@
 
 namespace low_latency {
 
+class LayerContext;
 class PhysicalDeviceContext;
 
 struct InstanceContext final : public Context {
 
+    const LayerContext& layer;
+
     const VkInstance instance;
     const VkuInstanceDispatchTable vtable;
 
-    std::unordered_map<void*, std::shared_ptr<PhysicalDeviceContext>> phys_devices;
+    std::unordered_map<void*, std::shared_ptr<PhysicalDeviceContext>>
+        phys_devices;
 
   public:
-    InstanceContext(const VkInstance& instance,
+    InstanceContext(const LayerContext& parent_context,
+                    const VkInstance& instance,
                     VkuInstanceDispatchTable&& vtable);
     virtual ~InstanceContext();
 };
diff --git a/src/layer.cc b/src/layer.cc
index d2977b7..3600a47 100644
--- a/src/layer.cc
+++ b/src/layer.cc
@@ -28,15 +28,15 @@ LayerContext layer_context;
 
 } // namespace
 
+// Small templates which search the pNext chain after `head` for a given sType.
 template <typename T>
 static T* find_next(void* const head, const VkStructureType& stype) {
-    for (auto i = reinterpret_cast<VkBaseOutStructure*>(head); i;
+    for (auto i = reinterpret_cast<VkBaseOutStructure*>(head)->pNext; i;
          i = i->pNext) {
 
-        if (i->sType != stype) {
-            continue;
+        if (i->sType == stype) {
+            return reinterpret_cast<T*>(i);
         }
-        return reinterpret_cast<T*>(i);
     }
     return nullptr;
 }
@@ -44,13 +44,13 @@ static T* find_next(void* const head, const VkStructureType& stype) {
 template <typename T>
 static const T* find_next(const void* const head,
                           const VkStructureType& stype) {
-    for (auto i = reinterpret_cast<const VkBaseInStructure*>(head); i;
+
+    for (auto i = reinterpret_cast<const VkBaseInStructure*>(head)->pNext; i;
          i = i->pNext) {
 
-        if (i->sType != stype) {
-            continue;
+        if (i->sType == stype) {
+            return reinterpret_cast<const T*>(i);
         }
-        return reinterpret_cast<const T*>(i);
     }
     return nullptr;
 }
@@ -59,12 +59,11 @@ template <typename T>
 static const T* find_link(const void* const head,
                           const VkStructureType& stype) {
     for (auto info = find_next<T>(head, stype); info;
-         info = find_next<T>(info->pNext, stype)) {
+         info = find_next<T>(info, stype)) {
 
-        if (info->function != VK_LAYER_LINK_INFO) {
-            continue;
+        if (info->function == VK_LAYER_LINK_INFO) {
+            return reinterpret_cast<const T*>(info);
         }
-        return reinterpret_cast<const T*>(info);
     }
     return nullptr;
 }
@@ -74,7 +73,7 @@ CreateInstance(const VkInstanceCreateInfo* pCreateInfo,
                const VkAllocationCallbacks* pAllocator, VkInstance* pInstance) {
 
     const auto link_info = find_link<VkLayerInstanceCreateInfo>(
-        pCreateInfo->pNext, VK_STRUCTURE_TYPE_LOADER_INSTANCE_CREATE_INFO);
+        pCreateInfo, VK_STRUCTURE_TYPE_LOADER_INSTANCE_CREATE_INFO);
 
     if (!link_info || !link_info->u.pLayerInfo) {
         return VK_ERROR_INITIALIZATION_FAILED;
@@ -122,7 +121,8 @@ CreateInstance(const VkInstanceCreateInfo* pCreateInfo,
     assert(!layer_context.contexts.contains(key));
 
     layer_context.contexts.try_emplace(
-        key, std::make_shared<InstanceContext>(*pInstance, std::move(vtable)));
+        key, std::make_shared<InstanceContext>(layer_context, *pInstance,
+                                               std::move(vtable)));
 
     return VK_SUCCESS;
 }
@@ -182,16 +182,18 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
     VkPhysicalDevice physical_device, const VkDeviceCreateInfo* pCreateInfo,
     const VkAllocationCallbacks* pAllocator, VkDevice* pDevice) {
 
+    const auto enabled_extensions =
+        std::span{pCreateInfo->ppEnabledExtensionNames,
+                  pCreateInfo->enabledExtensionCount};
+
     // Hook logic after create device looks like this.
     // !PHYS_SUPPORT &&  AL2_REQUESTED -> return INITIALIZATION_FAILED here.
     // !PHYS_SUPPORT && !AL2_REQUESTED -> hooks are no-ops
     //  PHYS_SUPPORT                   -> hooks inject timestamps regardless
     //                                    because AL1 might be used and it
     //                                    costs virtually nothing to do.
-    const auto was_antilag_requested = std::ranges::any_of(
-        std::span{pCreateInfo->ppEnabledExtensionNames,
-                  pCreateInfo->enabledExtensionCount},
-        [](const auto& ext) {
+    const auto was_antilag_requested =
+        std::ranges::any_of(enabled_extensions, [](const auto& ext) {
             return std::string_view{ext} == VK_AMD_ANTI_LAG_EXTENSION_NAME;
         });
 
@@ -201,7 +203,7 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
     }
 
     const auto create_info = find_link<VkLayerDeviceCreateInfo>(
-        pCreateInfo->pNext, VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO);
+        pCreateInfo, VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO);
     if (!create_info || !create_info->u.pLayerInfo) {
         return VK_ERROR_INITIALIZATION_FAILED;
     }
@@ -216,9 +218,7 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
 
     // Build a next extensions vector from what they have requested.
     const auto next_extensions = [&]() -> std::vector<const char*> {
-        auto next_extensions = std::span{pCreateInfo->ppEnabledExtensionNames,
-                                         pCreateInfo->enabledExtensionCount} |
-                               std::ranges::to<std::vector>();
+        auto next_extensions = std::vector(std::from_range, enabled_extensions);
 
         // Don't append anything extra if we don't support what we need.
         if (!context->supports_required_extensions) {
@@ -251,6 +251,7 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
     if (const auto result = context->instance.vtable.CreateDevice(
             physical_device, &next_create_info, pAllocator, pDevice);
         result != VK_SUCCESS) {
+
         return result;
     }
 
@@ -327,6 +328,9 @@ GetDeviceQueue(VkDevice device, std::uint32_t queue_family_index,
 
     const auto context = layer_context.get_context(device);
 
+    // GetDeviceQueue, unlike CreateDevice or CreateInstance, can be called
+    // multiple times and return the same queue object, so our insertion
+    // handling below has to account for an already-existing entry.
     context->vtable.GetDeviceQueue(device, queue_family_index, queue_index,
                                    queue);
     if (!queue || !*queue) {
@@ -344,8 +348,7 @@ GetDeviceQueue(VkDevice device, std::uint32_t queue_family_index,
                                                     queue_family_index);
     }
 
-    // it->second should be QueueContext, also it might already be there
-    // but this is expected.
+    // it->second should be QueueContext, also it might already be there.
     const auto ptr = std::dynamic_pointer_cast<QueueContext>(it->second);
     assert(ptr);
     context->queues.emplace(*queue, ptr);
@@ -617,8 +620,7 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2(
     vtable.GetPhysicalDeviceFeatures2(physical_device, pFeatures);
 
     const auto feature = find_next<VkPhysicalDeviceAntiLagFeaturesAMD>(
-        pFeatures->pNext,
-        VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ANTI_LAG_FEATURES_AMD);
+        pFeatures, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ANTI_LAG_FEATURES_AMD);
 
     if (feature) {
         feature->antiLag = context->supports_required_extensions;
@@ -633,6 +635,7 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2KHR(
 static VKAPI_ATTR void VKAPI_CALL
 AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD* pData) {
     const auto context = layer_context.get_context(device);
+    assert(pData);
     context->notify_antilag_update(*pData);
 }
 
diff --git a/src/layer_context.cc b/src/layer_context.cc
index ceb0030..28a94b5 100644
--- a/src/layer_context.cc
+++ b/src/layer_context.cc
@@ -1,8 +1,16 @@
 #include "layer_context.hh"
 
+#include <cstdlib> // for env var
+#include <string_view>
+
 namespace low_latency {
 
-LayerContext::LayerContext() {}
+LayerContext::LayerContext() {
+    this->is_antilag_1_enabled = []() -> auto {
+        const auto env = std::getenv(LayerContext::SLEEP_AFTER_PRESENT_ENV);
+        return env && std::string_view{env} == "1";
+    }();
+}
 
 LayerContext::~LayerContext() {}
 
diff --git a/src/layer_context.hh b/src/layer_context.hh
index 44857d4..c98768b 100644
--- a/src/layer_context.hh
+++ b/src/layer_context.hh
@@ -13,17 +13,15 @@
 
 // The purpose of this file is to provide a definition for the highest level
 // entry point struct of our vulkan state.
-//
-// All Context structs have deleted copy/move constructors. This is because we
-// want to be extremely explicit with how/when we delete things, and this allows
-// us to use destructors for cleanup without much worry about weird copies
-// floating around. Most contexts will probably live inside std::unique_ptr's as
-// a result so they can be used in standard containers.
 
 namespace low_latency {
 
 // All these templates do is make it so we can go from some DispatchableType
-// to their respective context's with nice syntax.
+// to their respective contexts with nice syntax. This lets us write something
+// like this for all DispatchableTypes:
+//
+//     const auto device_context = get_context(some_vk_device);
+//           ^ It was automatically deduced as DeviceContext, wow!
 
 template <typename T>
 concept DispatchableType =
@@ -49,10 +47,18 @@ template <DispatchableType D>
 using dispatch_context_t = typename context_for_t<D>::context;
 
 struct LayerContext final : public Context {
+  private:
+    // Name of the environment variable which, when set to exactly "1", makes
+    // us sleep after present.
+    static constexpr auto SLEEP_AFTER_PRESENT_ENV =
+        "LOW_LATENCY_LAYER_SLEEP_AFTER_PRESENT";
+
   public:
     std::mutex mutex;
     std::unordered_map<void*, std::shared_ptr<Context>> contexts;
 
+    bool is_antilag_1_enabled = false;
+
   public:
     LayerContext();
     virtual ~LayerContext();
diff --git a/src/queue_context.cc b/src/queue_context.cc
index 2096df3..9fe25b3 100644
--- a/src/queue_context.cc
+++ b/src/queue_context.cc
@@ -1,5 +1,6 @@
 #include "queue_context.hh"
 #include "device_context.hh"
+#include "layer_context.hh"
 #include "timestamp_pool.hh"
 
 #include <algorithm>
@@ -158,9 +159,14 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) {
     // any particular queue.
     this->device_context.notify_queue_present(*this);
 
-    // If antilag is on, the sleep will occur in notify_antilag_update at the
-    // device context.
-    if (this->device_context.antilag_mode != VK_ANTI_LAG_MODE_ON_AMD) {
+    // We should only sleep in present if two conditions are met:
+    //     1. Our antilag_mode isn't set to on, because otherwise the sleep will
+    //        be done in input and with far better results.
+    //     2. The 'is_antilag_1_enabled' flag, which exists at the layer's
+    //        context, is set.
+    if (this->device_context.antilag_mode != VK_ANTI_LAG_MODE_ON_AMD &&
+        this->device_context.instance.layer.is_antilag_1_enabled) {
+
         this->sleep_in_present();
     }
 }
@@ -268,11 +274,11 @@ void QueueContext::drain_frames_to_timings() {
         const auto cpu_start = [&]() -> auto {
             if (const auto it = std::rbegin(this->timings);
                 it != std::rend(this->timings)) {
+
                 return (*it)->frame.cpu_post_present_time;
             }
-            // This will happen *once*, and only for the first frame. We don't
-            // have a way of knowing when the CPU first started work obviously
-            // in this case because we're a vulkan layer and not omniscient.
+            // This will happen once, only for the first frame. We don't
+            // have a way of knowing when the CPU first started work here.
             // Just return our first submit's start for this edge case.
             return frame.submissions.front()->start_handle->get_time_required();
         }();