aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNicolas James <nj3ahxac@gmail.com>2026-03-29 20:44:23 +1100
committerNicolas James <nj3ahxac@gmail.com>2026-03-29 20:44:23 +1100
commit681bd5096ee416b50dd7338de30af7b3db385a36 (patch)
tree358b6bc6f9a3af66729b8ac3b15dd38cc0f4bd2a
parentd5ef2dbbd77c69dd93e92d5b7046a65c2361b59b (diff)
Implement Reflex - break AntiLag in the process. Remove AntiLag1. WIP
-rw-r--r--src/device_clock.cc73
-rw-r--r--src/device_clock.hh49
-rw-r--r--src/device_context.cc107
-rw-r--r--src/device_context.hh56
-rw-r--r--src/helper.cc5
-rw-r--r--src/helper.hh59
-rw-r--r--src/layer.cc141
-rw-r--r--src/physical_device_context.hh8
-rw-r--r--src/queue_context.cc352
-rw-r--r--src/queue_context.hh96
-rw-r--r--src/swapchain_monitor.cc112
-rw-r--r--src/swapchain_monitor.hh69
-rw-r--r--src/timestamp_pool.cc54
-rw-r--r--src/timestamp_pool.hh20
14 files changed, 575 insertions, 626 deletions
diff --git a/src/device_clock.cc b/src/device_clock.cc
new file mode 100644
index 0000000..52c86d3
--- /dev/null
+++ b/src/device_clock.cc
@@ -0,0 +1,73 @@
+#include "device_clock.hh"
+#include "device_context.hh"
+
+#include <vulkan/vulkan_core.h>
+
+#include <cassert>
+#include <time.h>
+
+namespace low_latency {
+
+DeviceClock::DeviceClock(const DeviceContext& context) : device(context) {
+ this->calibrate();
+}
+
+DeviceClock::~DeviceClock() {}
+
+DeviceClock::time_point_t DeviceClock::now() {
+ auto ts = timespec{};
+ if (clock_gettime(CLOCK_MONOTONIC, &ts)) {
+ throw errno;
+ }
+
+ return time_point_t{std::chrono::seconds{ts.tv_sec} +
+ std::chrono::nanoseconds{ts.tv_nsec}};
+}
+
+void DeviceClock::calibrate() {
+ const auto infos = std::vector<VkCalibratedTimestampInfoKHR>{
+ {VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr,
+ VK_TIME_DOMAIN_DEVICE_EXT},
+ {VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr,
+ VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT}};
+
+ struct CalibratedResult {
+ std::uint64_t device;
+ std::uint64_t host;
+ };
+ auto calibrated_result = CalibratedResult{};
+
+ THROW_NON_VKSUCCESS(device.vtable.GetCalibratedTimestampsKHR(
+ device.device, 2, std::data(infos), &calibrated_result.device,
+ &this->error_bound));
+
+ this->device_ticks = calibrated_result.device;
+ this->host_ns = calibrated_result.host;
+}
+
+DeviceClock::time_point_t
+DeviceClock::ticks_to_time(const std::uint64_t& ticks) const {
+ const auto& pd = device.physical_device.properties;
+ const auto ns_tick = static_cast<double>(pd->limits.timestampPeriod);
+
+ const auto diff = [&]() -> auto {
+ auto a = this->device_ticks;
+ auto b = ticks;
+ const auto is_negative = a > b;
+ if (is_negative) {
+ std::swap(a, b);
+ }
+ const auto abs_diff = b - a;
+ assert(abs_diff <= std::numeric_limits<std::int64_t>::max());
+ const auto signed_abs_diff = static_cast<std::int64_t>(abs_diff);
+ return is_negative ? -signed_abs_diff : signed_abs_diff;
+ }();
+
+ const auto diff_nsec =
+ static_cast<std::int64_t>(static_cast<double>(diff) * ns_tick + 0.5);
+ const auto delta = std::chrono::nanoseconds(
+ this->host_ns + static_cast<std::uint64_t>(diff_nsec));
+ return time_point_t{delta};
+}
+
+} // namespace low_latency \ No newline at end of file
diff --git a/src/device_clock.hh b/src/device_clock.hh
new file mode 100644
index 0000000..a52c59c
--- /dev/null
+++ b/src/device_clock.hh
@@ -0,0 +1,49 @@
+#ifndef CLOCK_HH_
+#define CLOCK_HH_
+
+#include <chrono>
+
+// This header provides a DeviceClock that abstracts away the Vulkan details of
+// comparing CPU and GPU times.
+
+namespace low_latency {
+
+class DeviceContext;
+
+class DeviceClock final {
+ public:
+ // FIXME this is bad, see now().
+ using time_point_t = std::chrono::time_point<std::chrono::steady_clock,
+ std::chrono::nanoseconds>;
+ const DeviceContext& device;
+
+ public:
+ std::uint64_t host_ns;
+ std::uint64_t error_bound;
+ std::uint64_t device_ticks;
+
+ public:
+ DeviceClock(const DeviceContext& device);
+ DeviceClock(const DeviceClock&) = delete;
+ DeviceClock(DeviceClock&&) = delete;
+ DeviceClock operator=(const DeviceClock&) = delete;
+ DeviceClock operator=(DeviceClock&&) = delete;
+ ~DeviceClock();
+
+ public:
+ // WARNING: This *MUST* be used over std::chrono::steady_clock::now if
+ // you're planning on comparing it to a device's clock. If it isn't, the
+    // timestamps might be from different domains and will be completely
+ // nonsensical.
+ // FIXME we should be able to fix this with a tiny wrapper class of
+ // time_point_t that enforces typesafety.
+ static time_point_t now();
+
+ public:
+ void calibrate();
+ time_point_t ticks_to_time(const std::uint64_t& ticks) const;
+};
+
+} // namespace low_latency
+
+#endif \ No newline at end of file
diff --git a/src/device_context.cc b/src/device_context.cc
index 58737e2..5438e40 100644
--- a/src/device_context.cc
+++ b/src/device_context.cc
@@ -1,6 +1,5 @@
#include "device_context.hh"
-#include <time.h>
#include <utility>
#include <vulkan/vulkan_core.h>
@@ -9,15 +8,15 @@ namespace low_latency {
DeviceContext::DeviceContext(InstanceContext& parent_instance,
PhysicalDeviceContext& parent_physical_device,
const VkDevice& device,
- const bool was_antilag_requested,
+ const bool was_capability_requested,
VkuDeviceDispatchTable&& vtable)
: instance(parent_instance), physical_device(parent_physical_device),
- was_antilag_requested(was_antilag_requested), device(device),
+ was_capability_requested(was_capability_requested), device(device),
vtable(std::move(vtable)) {
// Only create our clock if we can support creating it.
if (this->physical_device.supports_required_extensions) {
- this->clock = std::make_unique<Clock>(*this);
+ this->clock = std::make_unique<DeviceClock>(*this);
}
}
@@ -29,72 +28,10 @@ DeviceContext::~DeviceContext() {
}
}
-DeviceContext::Clock::Clock(const DeviceContext& context) : device(context) {
- this->calibrate();
-}
-
-DeviceContext::Clock::~Clock() {}
-
-DeviceContext::Clock::time_point_t DeviceContext::Clock::now() {
- auto ts = timespec{};
- if (clock_gettime(CLOCK_MONOTONIC, &ts)) {
- throw errno;
- }
-
- return time_point_t{std::chrono::seconds{ts.tv_sec} +
- std::chrono::nanoseconds{ts.tv_nsec}};
-}
-
-void DeviceContext::Clock::calibrate() {
- const auto infos = std::vector<VkCalibratedTimestampInfoKHR>{
- {VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr,
- VK_TIME_DOMAIN_DEVICE_EXT},
- {VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr,
- VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT}};
-
- struct CalibratedResult {
- std::uint64_t device;
- std::uint64_t host;
- };
- auto calibrated_result = CalibratedResult{};
-
- THROW_NON_VKSUCCESS(device.vtable.GetCalibratedTimestampsKHR(
- device.device, 2, std::data(infos), &calibrated_result.device,
- &this->error_bound));
-
- this->device_ticks = calibrated_result.device;
- this->host_ns = calibrated_result.host;
-}
-
-DeviceContext::Clock::time_point_t
-DeviceContext::Clock::ticks_to_time(const std::uint64_t& ticks) const {
- const auto& pd = device.physical_device.properties;
- const auto ns_tick = static_cast<double>(pd->limits.timestampPeriod);
-
- const auto diff = [&]() -> auto {
- auto a = this->device_ticks;
- auto b = ticks;
- const auto is_negative = a > b;
- if (is_negative) {
- std::swap(a, b);
- }
- const auto abs_diff = b - a;
- assert(abs_diff <= std::numeric_limits<std::int64_t>::max());
- const auto signed_abs_diff = static_cast<std::int64_t>(abs_diff);
- return is_negative ? -signed_abs_diff : signed_abs_diff;
- }();
-
- const auto diff_nsec =
- static_cast<std::int64_t>(static_cast<double>(diff) * ns_tick + 0.5);
- const auto delta = std::chrono::nanoseconds(
- this->host_ns + static_cast<std::uint64_t>(diff_nsec));
- return time_point_t{delta};
-}
-
+/*
void DeviceContext::sleep_in_input() {
// TODO
- /*
// Present hasn't happened yet, we don't know what queue to attack.
if (!this->present_queue) {
return;
@@ -122,30 +59,36 @@ void DeviceContext::sleep_in_input() {
// would get huge frame drops, loss of throughput, and the GPU would even
// clock down. So naturally I am concerned about this approach, but it seems
// to perform well so far in my own testing and is just beautifully elegant.
- */
}
+*/
-void DeviceContext::update_swapchain_infos(
+void DeviceContext::update_params(
const std::optional<VkSwapchainKHR> target,
const std::chrono::milliseconds& present_delay,
const bool was_low_latency_requested) {
- const auto write = SwapchainInfo{
- .present_delay = present_delay,
- .was_low_latency_requested = was_low_latency_requested,
- };
-
- if (target.has_value()) {
- const auto iter = this->swapchain_infos.find(*target);
- assert(iter != std::end(this->swapchain_infos)); // Must exist (spec).
- iter->second = write;
+ // If we don't have a target (AMD's anti_lag doesn't differentiate between
+ // swapchains), just write it to everything.
+ if (!target.has_value()) {
+ for (auto& iter : this->swapchain_monitors) {
+ iter.second.update_params(was_low_latency_requested, present_delay);
+ }
return;
}
- // If we don't have a target (AMD's anti_lag), just write it to everything.
- for (auto& iter : this->swapchain_infos) {
- iter.second = write;
- }
+ const auto iter = this->swapchain_monitors.find(*target);
+ assert(iter != std::end(this->swapchain_monitors));
+ iter->second.update_params(was_low_latency_requested, present_delay);
+}
+
+void DeviceContext::notify_present(
+ const VkSwapchainKHR& swapchain,
+ const QueueContext::submissions_t& submissions) {
+
+ const auto iter = this->swapchain_monitors.find(swapchain);
+ assert(iter != std::end(this->swapchain_monitors));
+
+ iter->second.notify_present(submissions);
}
} // namespace low_latency \ No newline at end of file
diff --git a/src/device_context.hh b/src/device_context.hh
index 6b5f000..172801c 100644
--- a/src/device_context.hh
+++ b/src/device_context.hh
@@ -11,59 +11,30 @@
#include <vulkan/vulkan_core.h>
#include "context.hh"
+#include "device_clock.hh"
#include "instance_context.hh"
#include "physical_device_context.hh"
+#include "queue_context.hh"
+#include "swapchain_monitor.hh"
namespace low_latency {
-class QueueContext;
-
-struct DeviceContext final : public Context {
+class DeviceContext final : public Context {
public:
InstanceContext& instance;
PhysicalDeviceContext& physical_device;
- const bool was_antilag_requested;
+ // Whether or not we were asked to do NV_VK_LowLatency2 or VK_AMD_anti_lag.
+ const bool was_capability_requested;
const VkDevice device;
const VkuDeviceDispatchTable vtable;
- // Tiny struct to represent any swapchain's low latency state.
- struct SwapchainInfo {
- std::chrono::milliseconds present_delay = std::chrono::milliseconds{0};
- bool was_low_latency_requested = false;
- };
- std::unordered_map<VkSwapchainKHR, SwapchainInfo> swapchain_infos{};
-
std::unordered_map<VkQueue, std::shared_ptr<QueueContext>> queues;
- struct Clock {
- public:
- using time_point_t = std::chrono::time_point<std::chrono::steady_clock,
- std::chrono::nanoseconds>;
- const DeviceContext& device;
-
- public:
- std::uint64_t host_ns;
- std::uint64_t error_bound;
- std::uint64_t device_ticks;
-
- public:
- Clock(const DeviceContext& device);
- ~Clock();
+ std::unique_ptr<DeviceClock> clock;
- public:
- // WARNING: This *MUST* be used over std::chrono::steady_clock::now if
- // you're planning on comparing it to a device's clock. If it isn't, the
- // timestamps might from different domains and will be completely
- // nonsensical.
- static time_point_t now();
-
- public:
- void calibrate();
- time_point_t ticks_to_time(const std::uint64_t& ticks) const;
- };
- std::unique_ptr<Clock> clock;
+ std::unordered_map<VkSwapchainKHR, SwapchainMonitor> swapchain_monitors;
public:
DeviceContext(InstanceContext& parent_instance,
@@ -73,13 +44,14 @@ struct DeviceContext final : public Context {
virtual ~DeviceContext();
public:
- void sleep_in_input();
-
// Updates the settings associated with that swapchain. If none is provided
// all swapchains are set to this value.
- void update_swapchain_infos(const std::optional<VkSwapchainKHR> target,
- const std::chrono::milliseconds& present_delay,
- const bool was_low_latency_requested);
+ void update_params(const std::optional<VkSwapchainKHR> target,
+ const std::chrono::milliseconds& present_delay,
+ const bool was_low_latency_requested);
+
+ void notify_present(const VkSwapchainKHR& swapchain,
+ const QueueContext::submissions_t& submissions);
};
}; // namespace low_latency
diff --git a/src/helper.cc b/src/helper.cc
new file mode 100644
index 0000000..bb17c59
--- /dev/null
+++ b/src/helper.cc
@@ -0,0 +1,5 @@
+#include "helper.hh"
+
+namespace low_latency {
+
+} \ No newline at end of file
diff --git a/src/helper.hh b/src/helper.hh
new file mode 100644
index 0000000..468f146
--- /dev/null
+++ b/src/helper.hh
@@ -0,0 +1,59 @@
+#ifndef HELPER_HH_
+#define HELPER_HH_
+
+#include <vulkan/vk_layer.h>
+#include <vulkan/vulkan.h>
+
+#include <cstdint>
+
+namespace low_latency {
+
+// Small templates which allow us to SFINAE find pNext structs.
+template <typename T>
+static T* find_next(void* const head, const VkStructureType& stype) {
+ for (auto i = reinterpret_cast<VkBaseOutStructure*>(head)->pNext; i;
+ i = i->pNext) {
+
+ if (i->sType == stype) {
+ return reinterpret_cast<T*>(i);
+ }
+ }
+ return nullptr;
+}
+
+template <typename T>
+static const T* find_next(const void* const head,
+ const VkStructureType& stype) {
+
+ for (auto i = reinterpret_cast<const VkBaseInStructure*>(head)->pNext; i;
+ i = i->pNext) {
+
+ if (i->sType == stype) {
+ return reinterpret_cast<const T*>(i);
+ }
+ }
+ return nullptr;
+}
+
+template <typename T>
+static const T* find_link(const void* const head,
+ const VkStructureType& stype) {
+ for (auto info = find_next<T>(head, stype); info;
+ info = find_next<T>(info, stype)) {
+
+ if (info->function == VK_LAYER_LINK_INFO) {
+ return reinterpret_cast<const T*>(info);
+ }
+ }
+ return nullptr;
+}
+
+template <typename T> std::uint64_t extract_present_id(const T& submit) {
+ const auto lspi = find_next<VkLatencySubmissionPresentIdNV>(
+ &submit, VK_STRUCTURE_TYPE_LATENCY_SUBMISSION_PRESENT_ID_NV);
+ return lspi ? lspi->presentID : 0;
+}
+
+} // namespace low_latency
+
+#endif \ No newline at end of file
diff --git a/src/layer.cc b/src/layer.cc
index 5460fca..7a7ffc8 100644
--- a/src/layer.cc
+++ b/src/layer.cc
@@ -14,7 +14,9 @@
#include <vulkan/vulkan.hpp>
#include <vulkan/vulkan_core.h>
+#include "device_clock.hh"
#include "device_context.hh"
+#include "helper.hh"
#include "instance_context.hh"
#include "layer_context.hh"
#include "queue_context.hh"
@@ -28,46 +30,6 @@ LayerContext layer_context;
} // namespace
-// Small templates which allow us to SFINAE find pNext structs.
-template <typename T>
-static T* find_next(void* const head, const VkStructureType& stype) {
- for (auto i = reinterpret_cast<VkBaseOutStructure*>(head)->pNext; i;
- i = i->pNext) {
-
- if (i->sType == stype) {
- return reinterpret_cast<T*>(i);
- }
- }
- return nullptr;
-}
-
-template <typename T>
-static const T* find_next(const void* const head,
- const VkStructureType& stype) {
-
- for (auto i = reinterpret_cast<const VkBaseInStructure*>(head)->pNext; i;
- i = i->pNext) {
-
- if (i->sType == stype) {
- return reinterpret_cast<const T*>(i);
- }
- }
- return nullptr;
-}
-
-template <typename T>
-static const T* find_link(const void* const head,
- const VkStructureType& stype) {
- for (auto info = find_next<T>(head, stype); info;
- info = find_next<T>(info, stype)) {
-
- if (info->function == VK_LAYER_LINK_INFO) {
- return reinterpret_cast<const T*>(info);
- }
- }
- return nullptr;
-}
-
static VKAPI_ATTR VkResult VKAPI_CALL
CreateInstance(const VkInstanceCreateInfo* pCreateInfo,
const VkAllocationCallbacks* pAllocator, VkInstance* pInstance) {
@@ -209,12 +171,12 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
// is not the case with AL2, because the vulkan application has to
// explicitly ask for the extension when it creates the device.
- const auto was_antilag_requested =
+ const auto was_capability_requested =
requested.contains(VK_AMD_ANTI_LAG_EXTENSION_NAME) ||
requested.contains(VK_NV_LOW_LATENCY_2_EXTENSION_NAME);
const auto context = layer_context.get_context(physical_device);
- if (!context->supports_required_extensions && was_antilag_requested) {
+ if (!context->supports_required_extensions && was_capability_requested) {
return VK_ERROR_INITIALIZATION_FAILED;
}
@@ -305,7 +267,7 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
assert(!layer_context.contexts.contains(key));
layer_context.contexts.try_emplace(
key, std::make_shared<DeviceContext>(context->instance, *context,
- *pDevice, was_antilag_requested,
+ *pDevice, was_capability_requested,
std::move(vtable)));
return VK_SUCCESS;
@@ -443,7 +405,7 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count,
// more explicit + insurance if that changes.
auto handles = std::vector<std::shared_ptr<TimestampPool::Handle>>{};
- const auto now = DeviceContext::Clock::now();
+ const auto now = DeviceClock::now();
std::ranges::transform(
std::span{submit_infos, submit_count}, std::back_inserter(next_submits),
@@ -451,7 +413,9 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count,
const auto head_handle = context->timestamp_pool->acquire();
const auto tail_handle = context->timestamp_pool->acquire();
head_handle->setup_command_buffers(*tail_handle, *context);
- context->notify_submit(submit, head_handle, tail_handle, now);
+
+ context->notify_submit(extract_present_id(submit), head_handle,
+ tail_handle, now);
handles.emplace_back(head_handle);
handles.emplace_back(tail_handle);
@@ -494,7 +458,7 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count,
auto next_cbs = std::vector<std::unique_ptr<cbs_t>>{};
auto handles = std::vector<std::shared_ptr<TimestampPool::Handle>>{};
- const auto now = DeviceContext::Clock::now();
+ const auto now = DeviceClock::now();
std::ranges::transform(
std::span{submit_infos, submit_count}, std::back_inserter(next_submits),
@@ -502,7 +466,9 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count,
const auto head_handle = context->timestamp_pool->acquire();
const auto tail_handle = context->timestamp_pool->acquire();
head_handle->setup_command_buffers(*tail_handle, *context);
- context->notify_submit(submit, head_handle, tail_handle, now);
+
+ context->notify_submit(extract_present_id(submit), head_handle,
+ tail_handle, now);
handles.emplace_back(head_handle);
handles.emplace_back(tail_handle);
@@ -553,7 +519,14 @@ vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* present_info) {
return res;
}
- context->notify_present(*present_info);
+ const auto pid = find_next<VkPresentIdKHR>(
+ present_info, VK_STRUCTURE_TYPE_PRESENT_ID_KHR);
+
+ for (auto i = std::uint32_t{0}; i < present_info->swapchainCount; ++i) {
+ const auto& swapchain = present_info->pSwapchains[i];
+ const auto present_id = pid ? pid->pPresentIds[i] : 0;
+ context->notify_present(swapchain, present_id);
+ }
return VK_SUCCESS;
}
@@ -644,6 +617,17 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2(
vtable.GetPhysicalDeviceFeatures2(physical_device, pFeatures);
+ // We're going to use this feature for both VK_AMD_anti_lag and
+ // VK_NV_low_latency2. It simplifies things a bit if we share a code path
+ // for now. TODO remove it in the future for VK_AMD_anti_lag.
+ if (const auto pidf = find_next<VkPhysicalDevicePresentIdFeaturesKHR>(
+ pFeatures,
+ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENT_ID_FEATURES_KHR);
+ pidf) {
+
+ pidf->presentId = true;
+ }
+
// Don't provide AntiLag if we're trying to spoof nvidia.
// Nvidia uses VkSurfaceCapabilities2KHR to determine if a surface
// is capable of reflex instead of AMD's physical device switch found here.
@@ -651,11 +635,11 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2(
return;
}
- const auto feature = find_next<VkPhysicalDeviceAntiLagFeaturesAMD>(
- pFeatures, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ANTI_LAG_FEATURES_AMD);
+ if (const auto alf = find_next<VkPhysicalDeviceAntiLagFeaturesAMD>(
+ pFeatures, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ANTI_LAG_FEATURES_AMD);
+ alf) {
- if (feature) {
- feature->antiLag = context->supports_required_extensions;
+ alf->antiLag = context->supports_required_extensions;
}
}
@@ -707,12 +691,11 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceSurfaceCapabilities2KHR(
const auto lsc = find_next<VkLatencySurfaceCapabilitiesNV>(
pSurfaceCapabilities,
VK_STRUCTURE_TYPE_LATENCY_SURFACE_CAPABILITIES_NV);
-
if (!lsc) {
return;
}
- // I kind of eyeballed these!
+ // I eyeballed these - there might be more that we can support.
const auto supported_modes = std::vector<VkPresentModeKHR>{
VK_PRESENT_MODE_IMMEDIATE_KHR,
VK_PRESENT_MODE_MAILBOX_KHR,
@@ -723,7 +706,7 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceSurfaceCapabilities2KHR(
// They're asking how many we want to return.
if (!lsc->pPresentModes) {
- lsc->presentModeCount = static_cast<std::uint32_t>(num_supported_modes);
+ lsc->presentModeCount = num_supported_modes;
return;
}
@@ -750,19 +733,17 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateSwapchainKHR(
return result;
}
- auto addition = DeviceContext::SwapchainInfo{
- .present_delay = std::chrono::milliseconds{0},
- .was_low_latency_requested = false,
- };
-
+ // VK_NV_low_latency2 allows a swapchain to be created with the low latency
+ // mode already on via VkSwapchainLatencyCreateInfoNV.
+ auto was_low_latency_requested = false;
if (const auto slci = find_next<VkSwapchainLatencyCreateInfoNV>(
pCreateInfo, VK_STRUCTURE_TYPE_SWAPCHAIN_LATENCY_CREATE_INFO_NV);
slci) {
-
- addition.was_low_latency_requested = slci->latencyModeEnable;
- }
- assert(context->swapchain_infos.try_emplace(*pSwapchain, addition).second);
+ was_low_latency_requested = slci->latencyModeEnable;
+ }
+ context->swapchain_monitors.try_emplace(*pSwapchain, *context,
+ was_low_latency_requested);
return VK_SUCCESS;
}
@@ -772,7 +753,7 @@ DestroySwapchainKHR(VkDevice device, VkSwapchainKHR swapchain,
const VkAllocationCallbacks* pAllocator) {
const auto context = layer_context.get_context(device);
- assert(context->swapchain_infos.erase(swapchain));
+ assert(context->swapchain_monitors.erase(swapchain));
context->vtable.DestroySwapchainKHR(device, swapchain, pAllocator);
}
@@ -788,20 +769,20 @@ AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD* pData) {
// NVIDIA's method and then have a working AL2 implementation follow using
// that existing code path.
- const auto present_delay = [&]() { // lambda abuse?
+ const auto present_delay = [&]() {
using namespace std::chrono;
return duration_cast<milliseconds>(1s / pData->maxFPS);
}();
- context->update_swapchain_infos(std::nullopt, present_delay,
- (pData->mode == VK_ANTI_LAG_MODE_ON_AMD));
+ context->update_params(std::nullopt, present_delay,
+ (pData->mode == VK_ANTI_LAG_MODE_ON_AMD));
if (!pData->pPresentationInfo) {
return;
}
if (pData->pPresentationInfo->stage == VK_ANTI_LAG_STAGE_INPUT_AMD) {
- context->sleep_in_input();
+ // TODO use nvidia's path
}
}
@@ -811,16 +792,25 @@ VkResult LatencySleepNV(VkDevice device, VkSwapchainKHR swapchain,
const auto context = layer_context.get_context(device);
assert(pSleepInfo);
- // TODO sleep here
+ // We're associating an application-provided timeline semaphore + value with
+ // a swapchain that says 'signal me when we should move past input'.
+ auto& swapchain_monitor = [&]() -> auto& {
+ const auto iter = context->swapchain_monitors.find(swapchain);
+ assert(iter != std::end(context->swapchain_monitors));
+ return iter->second;
+ }();
+
+ // Tell our swapchain monitor that if they want us to proceed they should
+ // signal this semaphore.
+ swapchain_monitor.notify_semaphore(pSleepInfo->signalSemaphore,
+ pSleepInfo->value);
return VK_SUCCESS;
}
void QueueNotifyOutOfBandNV(VkQueue queue,
const VkOutOfBandQueueTypeInfoNV* pQueueTypeInfo) {
- // This is really thoughtful from NVIDIA. Having the application explicitly
- // state which queues should be ignored for latency evaluation is far
- // superior to AMD's guessing game.
+
// Kind of interesting how you can't turn it back on once it's turned off.
// Also I really have no idea why pQueueTypeInfo's VkOutOfBandQueueTypeNV
// enum even exists (I guess we will find out later when nothing works).
@@ -834,14 +824,13 @@ VkResult SetLatencySleepModeNV(VkDevice device, VkSwapchainKHR swapchain,
const auto context = layer_context.get_context(device);
if (pSleepModeInfo) {
- context->update_swapchain_infos(
+ context->update_params(
swapchain,
std::chrono::milliseconds{pSleepModeInfo->minimumIntervalUs},
pSleepModeInfo->lowLatencyMode);
} else {
// If pSleepModeInfo is nullptr, it means no delay and no low latency.
- context->update_swapchain_infos(swapchain, std::chrono::milliseconds{0},
- false);
+ context->update_params(swapchain, std::chrono::milliseconds{0}, false);
}
return VK_SUCCESS;
}
diff --git a/src/physical_device_context.hh b/src/physical_device_context.hh
index 9624faa..f7ad289 100644
--- a/src/physical_device_context.hh
+++ b/src/physical_device_context.hh
@@ -4,6 +4,7 @@
#include "instance_context.hh"
#include <vulkan/vulkan.hpp>
+#include <vulkan/vulkan_core.h>
#include "context.hh"
@@ -17,7 +18,8 @@ class PhysicalDeviceContext final : public Context {
static constexpr auto required_extensions = {
VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME,
VK_KHR_CALIBRATED_TIMESTAMPS_EXTENSION_NAME,
- VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME};
+ VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME,
+ VK_KHR_PRESENT_ID_EXTENSION_NAME};
public:
InstanceContext& instance;
@@ -29,8 +31,8 @@ class PhysicalDeviceContext final : public Context {
using queue_properties_t = std::vector<VkQueueFamilyProperties2>;
std::unique_ptr<const queue_properties_t> queue_properties;
- // Will be set to true in the constructor if the physical device supports
- // everything we need to track gpu timing data.
+ // Will be true if the physical device supports everything in
+ // this->required_extensions.
bool supports_required_extensions = false;
public:
diff --git a/src/queue_context.cc b/src/queue_context.cc
index d12f03d..30e73c1 100644
--- a/src/queue_context.cc
+++ b/src/queue_context.cc
@@ -3,11 +3,6 @@
#include "layer_context.hh"
#include "timestamp_pool.hh"
-#include <algorithm>
-#include <chrono>
-#include <fstream>
-#include <iostream>
-#include <ranges>
#include <span>
#include <vulkan/vulkan_core.h>
@@ -49,333 +44,52 @@ QueueContext::QueueContext(DeviceContext& device_context, const VkQueue& queue,
}
QueueContext::~QueueContext() {
- this->in_flight_frames.clear();
- this->submissions.clear();
+ this->unpresented_submissions.clear();
this->timestamp_pool.reset();
}
void QueueContext::notify_submit(
- const VkSubmitInfo& info,
+ const present_id_t& present_id,
const std::shared_ptr<TimestampPool::Handle> head_handle,
const std::shared_ptr<TimestampPool::Handle> tail_handle,
- const DeviceContext::Clock::time_point_t& now) {
-
- auto signals = std::unordered_set<VkSemaphore>{};
- auto waits = std::unordered_set<VkSemaphore>{};
- std::ranges::copy(std::span{info.pWaitSemaphores, info.waitSemaphoreCount},
- std::inserter(waits, std::end(waits)));
- std::ranges::copy(
- std::span{info.pSignalSemaphores, info.signalSemaphoreCount},
- std::inserter(signals, std::end(signals)));
-
- this->submissions.emplace_back(std::make_unique<Submission>(
- std::move(signals), std::move(waits), head_handle, tail_handle, now));
-
- if (std::size(this->submissions) > this->MAX_TRACKED_SUBMISSIONS) {
- this->submissions.pop_front();
- }
-}
-
-// Identical to notify_submit, but we use VkSubmitInfo2.
-void QueueContext::notify_submit(
- const VkSubmitInfo2& info,
- const std::shared_ptr<TimestampPool::Handle> head_handle,
- const std::shared_ptr<TimestampPool::Handle> tail_handle,
- const DeviceContext::Clock::time_point_t& now) {
-
- auto signals = std::unordered_set<VkSemaphore>{};
- auto waits = std::unordered_set<VkSemaphore>{};
-
- std::ranges::transform(
- std::span{info.pWaitSemaphoreInfos, info.waitSemaphoreInfoCount},
- std::inserter(waits, std::end(waits)),
- [](const auto& info) -> auto { return info.semaphore; });
-
- std::ranges::transform(
- std::span{info.pSignalSemaphoreInfos, info.signalSemaphoreInfoCount},
- std::inserter(signals, std::end(signals)),
- [](const auto& info) -> auto { return info.semaphore; });
-
- this->submissions.emplace_back(std::make_unique<Submission>(
- std::move(signals), std::move(waits), head_handle, tail_handle, now));
-
- if (std::size(this->submissions) > this->MAX_TRACKED_SUBMISSIONS) {
- this->submissions.pop_front();
- }
-}
-
-void QueueContext::drain_submissions_to_frame() {
-
- // We are going to assume that all queue submissions before and on the same
- // queue contribute to the frame.
-
- // This used to be more complicated where we found the first submission that
- // was signalled by acquire, then we walked forwards until we found the
- // submission before it that marked the end of frame (which was the last
- // submission in the previous frame that called notify submit). This seemed
- // completely redundant, in all cases it was exactly what we have here. But
- // I could be wrong.
-
- const auto start_iter = std::begin(this->submissions);
- // no op submit?
- if (start_iter == std::end(this->submissions)) {
- return;
+ const DeviceClock::time_point_t& now) {
+
+ // Push this submission onto our unpresented_submissions at our present_id
+ // mapping (might be empty, but handled with operator[]).
+ auto& submissions = this->unpresented_submissions[present_id];
+ if (submissions == nullptr) {
+ submissions =
+ std::make_shared<std::deque<std::unique_ptr<Submission>>>();
}
- // The last submission is either in flight, already processed, or we
- // just happen to be the first frame and we can just set it to our start
- // with little consequence.
- const auto prev_frame_last_submit = [&]() -> auto {
- if (const auto iter = std::rbegin(this->in_flight_frames);
- iter != std::rend(this->in_flight_frames)) {
-
- assert(!iter->submissions.empty());
- return iter->submissions.back();
- }
-
- if (const auto iter = std::rbegin(this->timings);
- iter != std::rend(this->timings)) {
+ submissions->push_back(
+ std::make_unique<Submission>(Submission{.head_handle = head_handle,
+ .tail_handle = tail_handle,
+ .cpu_present_time = now}));
- const auto& submissions = (*iter)->frame.submissions;
- assert(!submissions.empty());
-
- return submissions.back();
- }
-
- return *start_iter;
- }();
-
- this->in_flight_frames.emplace_back(
- Frame{.submissions = std::move(this->submissions),
- .cpu_post_present_time = DeviceContext::Clock::now()});
- assert(std::size(this->in_flight_frames.back().submissions));
- // *valid but unspecified state after move, so clear!*
- this->submissions.clear();
-}
-
-void QueueContext::notify_present(const VkPresentInfoKHR& info) {
- this->drain_submissions_to_frame();
- this->drain_frames_to_timings();
-
- // We should only sleep in present if two conditions are met:
- // 1. Our antilag_mode isn't set to on, because otherwise the sleep will
- // be done in input and with far better results.
- // 2. The 'is_antilag_1_enabled' flag, which exists at the layer's
- // context, is set.
- //
- /*
- * WIP REFLEX
- if (this->device_context.antilag_mode != VK_ANTI_LAG_MODE_ON_AMD &&
- this->device_context.instance.layer.is_antilag_1_enabled) {
-
- this->sleep_in_present();
+ // This is probably hit if our queue never actually presents to anything,
+ // because the only time we manually evict our unpresented_submissions is
+ // when we present to something.
+ if (std::size(*submissions) > this->MAX_TRACKED_SUBMISSIONS) {
+ submissions->pop_front();
}
- */
}
-const auto debug_log_time2 = [](auto& stream, const auto& diff) {
- using namespace std::chrono;
- const auto ms = duration_cast<milliseconds>(diff);
- const auto us = duration_cast<microseconds>(diff - ms);
- const auto ns = duration_cast<nanoseconds>(diff - ms - us);
- stream << ms << " " << us << " " << ns << " ago\n";
-};
+void QueueContext::notify_present(const VkSwapchainKHR& swapchain,
+ const present_id_t& present_id) {
-void QueueContext::drain_frames_to_timings() {
- if (!std::size(this->in_flight_frames)) {
- return;
+ // Notify the device that this swapchain was just presented to.
+ // We're avoiding a double hash here - don't use operator[] and erase.
+ auto iter = this->unpresented_submissions.try_emplace(present_id).first;
+ if (iter->second == nullptr) {
+ iter->second =
+ std::make_shared<std::deque<std::unique_ptr<Submission>>>();
}
- // Only need to calibrate this device, we don't support multi device anti
- // lag.
- this->device_context.clock->calibrate();
-
- while (std::size(this->in_flight_frames)) {
- const auto& frame = this->in_flight_frames.front();
-
- assert(std::size(frame.submissions));
-
- const auto& last_submission = frame.submissions.back();
-
- // Not completed (so future frames definitely aren't) - stop early.
- if (!last_submission->end_handle->get_time().has_value()) {
- break;
- }
-
- // We are committed to removing the frame at this stage and
- // promoting it to a 'timing' struct because it's completed.
- // We can guarantee that we can extract timing information from
- // all start/end handles now.
-
- // Using leetcode merge intervals in the wild lol
- struct Interval {
- DeviceContext::Clock::time_point_t start, end;
- };
-
- const auto sorted_intervals = [&]() -> auto {
- auto intervals = std::vector<Interval>{};
- std::ranges::transform(
- frame.submissions, std::back_inserter(intervals),
- [&](const auto& submission) {
- return Interval{
- .start = submission->start_handle->get_time_required(),
- .end = submission->end_handle->get_time_required(),
- };
- });
-
- std::ranges::sort(intervals, [](const auto& a, const auto& b) {
- return a.start < b.start;
- });
- return intervals;
- }();
-
- const auto merged = [&]() -> auto {
- auto merged = std::vector<Interval>{};
- auto last = sorted_intervals[0];
-
- for (const auto& [s, e] : sorted_intervals | std::views::drop(1)) {
- if (s <= last.end) {
- last.end = std::max(last.end, e);
- } else {
- merged.push_back(last);
- last = {s, e};
- }
- }
- merged.push_back(last);
- return merged;
- }();
-
- // It's important to note that gputime starts from a point which isn't
- // equal to the below 'start' var. It looks something like this, where a
- // '-' represents CPU time only and '=' represents CPU + GPU.
- //
- // |---------------------|=========|--------|====|-----------------|
- // ^ last_present ^ merged.front().start present ^
- // merged.back().end ^
- //
- // I would imagine there would be more GPU than cpu to reach the anti
- // lag codepath than is depicted here. We can track the total time
- // between vkPresent calls as future_submit - last_submit. The total
- // time the GPU spent engaged is the sum of all intervals. So we can
- // get a meaningful 'not_gputime' as total - gpu_time.
-
- const auto gputime = std::ranges::fold_left(
- merged, DeviceContext::Clock::time_point_t::duration{},
- [](auto gputime, const auto& interval) {
- const auto& [start, end] = interval;
- return gputime + (end - start);
- });
-
- // Our cpu_start value here refers to the time when the CPU was allowed
- // to move past the present call and, in theory, begin cpu work on the
- // next frame.
- const auto cpu_start = [&]() -> auto {
- if (const auto it = std::rbegin(this->timings);
- it != std::rend(this->timings)) {
-
- return (*it)->frame.cpu_post_present_time;
- }
- // This will happen once, only for the first frame. We don't
- // have a way of knowing when the CPU first started work here.
- // Just return our first submit's start for this edge case.
- return frame.submissions.front()->start_handle->get_time_required();
- }();
-
- const auto cputime =
- frame.submissions.front()->enqueued_time - cpu_start;
-
- this->timings.emplace_back(std::make_unique<Timing>(Timing{
- .gputime = gputime,
- .cputime = cputime,
- .frame = frame,
- }));
-
- this->in_flight_frames.pop_front();
- }
-
- if (const auto T = std::size(this->timings);
- T > this->MAX_TRACKED_TIMINGS) {
-
- const auto erase_to_iter =
- std::next(std::begin(this->timings),
- static_cast<long>(T - MAX_TRACKED_TIMINGS));
- this->timings.erase(std::begin(this->timings), erase_to_iter);
- }
-}
-
-void QueueContext::sleep_in_present() {
- // After calling this, any remaining frames are truly in flight.
- this->drain_frames_to_timings();
- if (!std::size(this->in_flight_frames)) {
- return;
- }
-
- // This is getting the most recent frame and waiting until its start has
- // begun. This means that, in the case of >1 frame in flight, it's draining
- // all of them before we're allowed to move forward.
- const auto first_gpu_work = [&]() -> auto {
- const auto& most_recent_frame = this->in_flight_frames.back();
- const auto& first_submission = most_recent_frame.submissions.front();
- return first_submission->start_handle->get_time_spinlock();
- }();
-
- // Drain frames again because as stated above, we might have multiple frames
- // now completed after our wait spinlock.
- this->drain_frames_to_timings();
-
- // Check the size again because the frame we want to target may have already
- // completed when we called process_frames().
- if (!std::size(this->in_flight_frames)) {
- return;
- }
- assert(std::size(this->in_flight_frames) == 1);
-
- // Not enough data yet to apply any delays.
- if (std::size(this->timings) < this->MAX_TRACKED_TIMINGS) {
- return;
- }
-
- const auto calc_median = [&, this](const auto& getter) {
- auto vect = std::vector<Timing*>{};
- std::ranges::transform(this->timings, std::back_inserter(vect),
- [](const auto& timing) { return timing.get(); });
- std::ranges::sort(vect, [&](const auto& a, const auto& b) {
- return getter(a) < getter(b);
- });
- return getter(vect[std::size(vect) / 2]);
- };
-
- const auto expected_gputime =
- calc_median([](const auto& timing) { return timing->gputime; });
- const auto expected_cputime =
- calc_median([](const auto& timing) { return timing->cputime; });
-
- // Should look like this:
- // total_length = expected_gputime
- // |------------------------x------------------------------|
- // ^ first_gpu_work now last_gpu_work ^
-
- const auto now = DeviceContext::Clock::now();
- const auto dist = now - first_gpu_work;
- const auto expected_dist_to_last = expected_gputime - dist;
-
- const auto wait_time = expected_dist_to_last - expected_cputime;
-
- auto& frame = this->in_flight_frames.back();
- const auto& last_gpu_work = frame.submissions.back()->end_handle;
- last_gpu_work->get_time_spinlock(now + wait_time);
-
- frame.cpu_post_present_time = std::chrono::steady_clock::now();
+ this->device_context.notify_present(swapchain, iter->second);
- std::ofstream f("/tmp/times.txt", std::ios::trunc);
- f << " expected gputime: ";
- debug_log_time2(f, expected_gputime);
- f << " expected cputime: ";
- debug_log_time2(f, expected_cputime);
- f << " requestd sleep: ";
- debug_log_time2(f, wait_time);
- f << " observed sleep: ";
- debug_log_time2(f, frame.cpu_post_present_time - now);
+ // Important, we nuke the submission because now it's presented.
+ this->unpresented_submissions.erase(iter);
}
bool QueueContext::should_inject_timestamps() const {
@@ -385,9 +99,9 @@ bool QueueContext::should_inject_timestamps() const {
return false;
}
- // Don't bother injecting timestamps during queue submission if both AL1 and
- // AL2 are disabled.
- if (!this->device_context.was_antilag_requested &&
+ // Don't bother injecting timestamps during queue submission if we
+ // aren't planning on doing anything anyway.
+ if (!this->device_context.was_capability_requested &&
!physical_device.instance.layer.is_antilag_1_enabled) {
return false;
diff --git a/src/queue_context.hh b/src/queue_context.hh
index 221626f..48500e1 100644
--- a/src/queue_context.hh
+++ b/src/queue_context.hh
@@ -2,33 +2,23 @@
#define QUEUE_STATE_HH_
#include "context.hh"
-#include "device_context.hh"
+#include "device_clock.hh"
#include "timestamp_pool.hh"
#include <vulkan/utility/vk_dispatch_table.h>
#include <vulkan/vulkan.hpp>
-#include <chrono>
#include <deque>
#include <memory>
-#include <unordered_set>
+#include <unordered_map>
namespace low_latency {
class QueueContext final : public Context {
private:
- // The amount of finished frame timing data we keep before eviction.
- // For now, this value is also the number of data points used in the
- // calculation of gpu timing information.
- static constexpr auto MAX_TRACKED_TIMINGS = 50u;
// The amount of queue submissions we allow tracked per queue before
- // we give up tracking them. For a queue that is presented to,
- // these submissions will be constantly moved to Frame structs so
- // it's not an issue that we only track so many - unless it just
- // happens that an application makes an unexpectedly large
- // amount of vkQueueSubmit's per frame. For queues which don't
- // present, this limit stops them from growing limitlessly in memory
- // as we may not necessarily manually evict them yet.
+ // we give up tracking them. This is necessary for queues which do not
+ // present anything.
static constexpr auto MAX_TRACKED_SUBMISSIONS = 50u;
public:
@@ -59,55 +49,35 @@ class QueueContext final : public Context {
// NVIDIA's extension lets the application explicitly state that this queue
// does not contribute to the frame. AMD's extension has no such mechanism -
- // so this will always be false.
+ // so this will always be false when using VK_AMD_anti_lag.
bool should_ignore_latency = false;
public:
- // Potentially in flight queue submissions that come from this queue.
+ // I want our queue bookkeeping to be fairly simple and do one thing - track
+ // submissions that have yet to be presented to a swapchain. General
+ // idea:
+ //
+ // For each vkQueueSubmit (specifically for each pSubmitInfo in that
+ // hook) grab the VK_EXT_present_id value provided by the application for
+ // that submission. Once we add our timing objects as part of the hook, we
+ // then take those timing objects, bundle them into a Submission struct, and
+ // append it to the (potentially currently nonexistent) mapping of
+ // present_id's to deque<Submission>'s. Now we cleanly track what queue
+ // submissions refer to what present_id.
+ //
+ // When our hook sees a VkQueuePresentKHR, we take the provided present_id
+ // and notify our device that it needs to watch for when this completes.
+ // We give it our submission. Now, it's out of our hands. We remove the
+ // present_id_t mapping when doing so.
struct Submission {
- const std::unordered_set<VkSemaphore> signals;
- const std::unordered_set<VkSemaphore> waits;
-
- const std::shared_ptr<TimestampPool::Handle> start_handle;
- const std::shared_ptr<TimestampPool::Handle> end_handle;
-
- const DeviceContext::Clock::time_point_t enqueued_time;
- };
- using submission_ptr_t = std::shared_ptr<Submission>;
- std::deque<submission_ptr_t> submissions;
-
- // In flight frame submissions grouped together.
- // The first element in the deque refers to the first submission that
- // contributed to that frame. The last element is the last submission before
- // present was called.
- // std::size(submissions) >= 1 btw
- struct Frame {
- std::deque<submission_ptr_t> submissions;
-
- // the point that control flow was returned from VkQueuePresentKHR back
- // to the application.
- DeviceContext::Clock::time_point_t cpu_post_present_time;
- };
- std::deque<Frame> in_flight_frames;
-
- // Completed frames.
- struct Timing {
- DeviceContext::Clock::time_point_t::duration gputime, cputime;
-
- Frame frame;
+ std::shared_ptr<TimestampPool::Handle> head_handle, tail_handle;
+ DeviceClock::time_point_t cpu_present_time;
};
- std::deque<std::unique_ptr<Timing>> timings;
- private:
- // Drains submissions and promotes them into a single frame object.
- void drain_submissions_to_frame();
-
- // Drains in flight frames and promotes them into a Timing object if they
- // have completed.
- void drain_frames_to_timings();
-
- // Antilag 1 equivalent where we sleep after present to reduce queueing.
- void sleep_in_present();
+ using submissions_t =
+ std::shared_ptr<std::deque<std::unique_ptr<Submission>>>;
+ using present_id_t = std::uint64_t;
+ std::unordered_map<present_id_t, submissions_t> unpresented_submissions;
public:
QueueContext(DeviceContext& device_context, const VkQueue& queue,
@@ -115,17 +85,13 @@ class QueueContext final : public Context {
virtual ~QueueContext();
public:
- void notify_submit(const VkSubmitInfo& info,
- const std::shared_ptr<TimestampPool::Handle> head_handle,
- const std::shared_ptr<TimestampPool::Handle> tail_handle,
- const DeviceContext::Clock::time_point_t& now);
-
- void notify_submit(const VkSubmitInfo2& info,
+ void notify_submit(const present_id_t& present_id,
const std::shared_ptr<TimestampPool::Handle> head_handle,
const std::shared_ptr<TimestampPool::Handle> tail_handle,
- const DeviceContext::Clock::time_point_t& now);
+ const DeviceClock::time_point_t& now);
- void notify_present(const VkPresentInfoKHR& info);
+ void notify_present(const VkSwapchainKHR& swapchain,
+ const std::uint64_t& present_id);
public:
bool should_inject_timestamps() const;
diff --git a/src/swapchain_monitor.cc b/src/swapchain_monitor.cc
new file mode 100644
index 0000000..09fa8ba
--- /dev/null
+++ b/src/swapchain_monitor.cc
@@ -0,0 +1,112 @@
+#include "swapchain_monitor.hh"
+#include "device_context.hh"
+
+#include <vulkan/vulkan_core.h>
+
+#include <functional>
+#include <mutex>
+
+namespace low_latency {
+
+SwapchainMonitor::SwapchainMonitor(const DeviceContext& device,
+ const bool was_low_latency_requested)
+ : device(device), was_low_latency_requested(was_low_latency_requested),
+ swapchain_worker(
+ std::bind_front(&SwapchainMonitor::do_swapchain_monitor, this)) {}
+
+SwapchainMonitor::~SwapchainMonitor() {}
+
+void SwapchainMonitor::WakeupSemaphore::signal(
+ const DeviceContext& device) const {
+
+ const auto ssi =
+ VkSemaphoreSignalInfo{.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO,
+ .semaphore = this->timeline_semaphore,
+ .value = this->value};
+ THROW_NON_VKSUCCESS(device.vtable.SignalSemaphore(device.device, &ssi));
+}
+
+void SwapchainMonitor::do_swapchain_monitor(const std::stop_token stoken) {
+ for (;;) {
+ auto lock = std::unique_lock{this->mutex};
+ this->cv.wait(lock, stoken,
+ [&]() { return !this->wakeup_semaphores.empty(); });
+
+ if (stoken.stop_requested()) {
+ // Small chance an application might need outstanding semaphores
+ // to be signalled if it's closing to avoid a hang.
+ break;
+ }
+
+ // Look for the latest submission and make sure it's completed.
+ if (!this->in_flight_submissions.empty()) {
+ const auto submission = this->in_flight_submissions.back();
+ this->in_flight_submissions.clear();
+
+ if (!submission->empty()) {
+ submission->back()->tail_handle->await_time();
+ }
+ }
+
+ // We might want to signal them all? In theory it's the same timeline
+ // semaphore so obviously it's redundant to signal them one by one. In
+ // almost all cases, there should just be one here anyway.
+ const auto wakeup_semaphore = this->wakeup_semaphores.back();
+ wakeup_semaphores.clear();
+
+ wakeup_semaphore.signal(this->device);
+ }
+}
+
+void SwapchainMonitor::update_params(
+ const bool was_low_latency_requested,
+ const std::chrono::milliseconds present_delay) {
+
+ const auto lock = std::scoped_lock{this->mutex};
+
+ this->was_low_latency_requested = was_low_latency_requested;
+ this->present_delay = present_delay;
+}
+
+void SwapchainMonitor::notify_semaphore(const VkSemaphore& timeline_semaphore,
+ const std::uint64_t& value) {
+
+ const auto lock = std::scoped_lock{this->mutex};
+
+ const auto wakeup_semaphore = WakeupSemaphore{
+ .timeline_semaphore = timeline_semaphore, .value = value};
+
+ // Signal immediately if low_latency isn't requested or if we have no
+ // outstanding work.
+ if (!this->was_low_latency_requested ||
+ this->in_flight_submissions.empty()) {
+
+ wakeup_semaphore.signal(this->device);
+ return;
+ }
+
+ this->wakeup_semaphores.emplace_back(timeline_semaphore, value);
+ this->cv.notify_one();
+}
+
+void SwapchainMonitor::notify_present(
+ const QueueContext::submissions_t& submissions) {
+
+ const auto lock = std::scoped_lock{this->mutex};
+
+ // Fast path where this work has already completed.
+ if (!this->wakeup_semaphores.empty() && !submissions->empty()) {
+
+ const auto& finished = submissions->back()->tail_handle->get_time();
+ if (finished.has_value()) {
+ this->wakeup_semaphores.back().signal(this->device);
+ this->wakeup_semaphores.clear();
+ return;
+ }
+ }
+
+ this->in_flight_submissions.emplace_back(submissions);
+ this->cv.notify_one();
+}
+
+} // namespace low_latency \ No newline at end of file
diff --git a/src/swapchain_monitor.hh b/src/swapchain_monitor.hh
new file mode 100644
index 0000000..5678630
--- /dev/null
+++ b/src/swapchain_monitor.hh
@@ -0,0 +1,69 @@
+#ifndef SWAPCHAIN_MONITOR_HH_
+#define SWAPCHAIN_MONITOR_HH_
+
+// The purpose of this file is to provide a SwapchainMonitor class definition.
+
+#include <vulkan/vulkan_core.h>
+
+#include <chrono>
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+#include <thread>
+
+#include "queue_context.hh"
+
+namespace low_latency {
+
+class DeviceContext;
+
+// A swapchain monitor's job is to provide asynchronous wakeups for threads
+// which request low_latency once the previous presentation has completed.
+// It does this by signalling a semaphore a la VK_NV_low_latency2.
+class SwapchainMonitor {
+ private:
+ const DeviceContext& device;
+
+ // Configurable params for this swapchain.
+ std::chrono::milliseconds present_delay = std::chrono::milliseconds{0};
+ bool was_low_latency_requested = false;
+
+ struct WakeupSemaphore {
+ VkSemaphore timeline_semaphore;
+ std::uint64_t value;
+
+ public:
+ void signal(const DeviceContext& device) const;
+ };
+ std::deque<WakeupSemaphore> wakeup_semaphores;
+ std::deque<QueueContext::submissions_t> in_flight_submissions;
+
+ std::mutex mutex;
+ std::condition_variable_any cv;
+ std::jthread swapchain_worker;
+
+ private:
+ void do_swapchain_monitor(const std::stop_token stoken);
+
+ public:
+ SwapchainMonitor(const DeviceContext& device,
+ const bool was_low_latency_requested);
+ SwapchainMonitor(const SwapchainMonitor&);
+ SwapchainMonitor(SwapchainMonitor&&);
+ SwapchainMonitor operator=(const SwapchainMonitor&);
+ SwapchainMonitor operator=(SwapchainMonitor&&);
+ ~SwapchainMonitor();
+
+ public:
+ void update_params(const bool was_low_latency_requested,
+ const std::chrono::milliseconds present_delay);
+
+ void notify_semaphore(const VkSemaphore& timeline_semaphore,
+ const std::uint64_t& value);
+
+ void notify_present(const QueueContext::submissions_t& submissions);
+};
+
+} // namespace low_latency
+
+#endif \ No newline at end of file
diff --git a/src/timestamp_pool.cc b/src/timestamp_pool.cc
index 247d411..a37b2bc 100644
--- a/src/timestamp_pool.cc
+++ b/src/timestamp_pool.cc
@@ -152,19 +152,18 @@ void TimestampPool::Handle::setup_command_buffers(
THROW_NON_VKSUCCESS(vtable.EndCommandBuffer(tail.command_buffer));
}
-std::optional<DeviceContext::Clock::time_point_t>
-TimestampPool::Handle::get_time() {
- const auto& device_ctx = this->timestamp_pool.queue_context.device_context;
- const auto& vtable = device_ctx.vtable;
+struct QueryResult {
+ std::uint64_t value;
+ std::uint64_t available;
+};
+std::optional<DeviceClock::time_point_t> TimestampPool::Handle::get_time() {
+ const auto& context = this->timestamp_pool.queue_context.device_context;
+ const auto& vtable = context.vtable;
- struct QueryResult {
- std::uint64_t value;
- std::uint64_t available;
- };
auto query_result = QueryResult{};
const auto result = vtable.GetQueryPoolResults(
- device_ctx.device, query_pool,
+ context.device, query_pool,
static_cast<std::uint32_t>(this->query_index), 1, sizeof(query_result),
&query_result, sizeof(query_result),
VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT);
@@ -177,30 +176,31 @@ TimestampPool::Handle::get_time() {
return std::nullopt;
}
- return device_ctx.clock->ticks_to_time(query_result.value);
+ return context.clock->ticks_to_time(query_result.value);
}
-std::optional<DeviceContext::Clock::time_point_t>
-TimestampPool::Handle::get_time_spinlock(
- const DeviceContext::Clock::time_point_t& until) {
+DeviceClock::time_point_t TimestampPool::Handle::await_time() {
+ const auto& context = this->timestamp_pool.queue_context.device_context;
+ const auto& vtable = context.vtable;
- auto time = this->get_time();
- for (; !time.has_value(); time = this->get_time()) {
- if (const auto now = DeviceContext::Clock::now(); now >= until) {
- break;
- }
- }
- return time;
-}
+ struct QueryResult {
+ std::uint64_t value;
+ std::uint64_t available;
+ };
+ auto query_result = QueryResult{};
-DeviceContext::Clock::time_point_t TimestampPool::Handle::get_time_spinlock() {
- constexpr auto max = DeviceContext::Clock::time_point_t::max();
- const auto time = this->get_time_spinlock(max);
- assert(time.has_value());
- return *time;
+ THROW_NON_VKSUCCESS(vtable.GetQueryPoolResults(
+ context.device, query_pool,
+ static_cast<std::uint32_t>(this->query_index), 1, sizeof(query_result),
+ &query_result, sizeof(query_result),
+ VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT |
+ VK_QUERY_RESULT_WAIT_BIT));
+ assert(query_result.available);
+
+ return context.clock->ticks_to_time(query_result.value);
}
-DeviceContext::Clock::time_point_t TimestampPool::Handle::get_time_required() {
+DeviceClock::time_point_t TimestampPool::Handle::get_time_required() {
const auto time = this->get_time();
assert(time.has_value());
return *time;
diff --git a/src/timestamp_pool.hh b/src/timestamp_pool.hh
index 67b34de..0d6c52d 100644
--- a/src/timestamp_pool.hh
+++ b/src/timestamp_pool.hh
@@ -18,11 +18,12 @@
#include <unordered_set>
#include <vector>
-#include "device_context.hh"
+#include "device_clock.hh"
namespace low_latency {
class QueueContext;
+class DeviceContext;
class TimestampPool final {
private:
@@ -119,20 +120,15 @@ class TimestampPool final {
void setup_command_buffers(const Handle& tail,
const QueueContext& queue_context) const;
- // Attempts to get_time, but returns an optional if it's not available
- // yet.
- std::optional<DeviceContext::Clock::time_point_t> get_time();
-
- // Calls get_time() repeatedly under a spinlock, or gives up at
- // time_point_t and returns std::nullopt.
- std::optional<DeviceContext::Clock::time_point_t>
- get_time_spinlock(const DeviceContext::Clock::time_point_t& until);
+ public:
+ // Attempts to get the time - optional if it's not available yet.
+ std::optional<DeviceClock::time_point_t> get_time();
- // Calls get_time() repeatedly under a spinlock until it's available.
- DeviceContext::Clock::time_point_t get_time_spinlock();
+ // Waits until the time is available and returns it.
+ DeviceClock::time_point_t await_time();
// Calls get_time with the assumption it's already available.
- DeviceContext::Clock::time_point_t get_time_required();
+ DeviceClock::time_point_t get_time_required();
};
public: