aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/context.hh5
-rw-r--r--src/device_clock.cc3
-rw-r--r--src/device_context.cc34
-rw-r--r--src/device_context.hh9
-rw-r--r--src/helper.hh5
-rw-r--r--src/layer.cc132
-rw-r--r--src/layer_context.cc7
-rw-r--r--src/layer_context.hh15
-rw-r--r--src/physical_device_context.cc14
-rw-r--r--src/physical_device_context.hh10
-rw-r--r--src/queue_context.cc36
-rw-r--r--src/queue_context.hh8
-rw-r--r--src/swapchain_monitor.cc18
-rw-r--r--src/swapchain_monitor.hh4
-rw-r--r--src/timestamp_pool.cc50
-rw-r--r--src/timestamp_pool.hh13
16 files changed, 157 insertions, 206 deletions
diff --git a/src/context.hh b/src/context.hh
index 6524984..718b697 100644
--- a/src/context.hh
+++ b/src/context.hh
@@ -3,11 +3,6 @@
namespace low_latency {
-#define THROW_NON_VKSUCCESS(x) \
- if (const auto result = x; result != VK_SUCCESS) { \
- throw result; \
- }
-
// A context class doesn't do much by itself. We just use it to provide a
// virtual destructor so we can store a bunch of shared_ptrs in the same
// container and rely on RTTI in the layer context. It also deletes the copy and
diff --git a/src/device_clock.cc b/src/device_clock.cc
index 52c86d3..8e0e408 100644
--- a/src/device_clock.cc
+++ b/src/device_clock.cc
@@ -1,5 +1,6 @@
#include "device_clock.hh"
#include "device_context.hh"
+#include "helper.hh"
#include <vulkan/vulkan_core.h>
@@ -37,7 +38,7 @@ void DeviceClock::calibrate() {
};
auto calibrated_result = CalibratedResult{};
- THROW_NON_VKSUCCESS(device.vtable.GetCalibratedTimestampsKHR(
+ THROW_NOT_VKSUCCESS(device.vtable.GetCalibratedTimestampsKHR(
device.device, 2, std::data(infos), &calibrated_result.device,
&this->error_bound));
diff --git a/src/device_context.cc b/src/device_context.cc
index 5438e40..e2f2a4a 100644
--- a/src/device_context.cc
+++ b/src/device_context.cc
@@ -28,40 +28,6 @@ DeviceContext::~DeviceContext() {
}
}
-/*
-void DeviceContext::sleep_in_input() {
- // TODO
-
- // Present hasn't happened yet, we don't know what queue to attack.
- if (!this->present_queue) {
- return;
- }
-
- const auto& frames = this->present_queue->in_flight_frames;
- // No frame here means we're behind the GPU and do not need to delay.
- // If anything we should speed up...
- if (!std::size(frames)) {
- return;
- }
-
- // If we're here, that means that there might be an outstanding frame that's
- // sitting on our present_queue which hasn't yet completed, so we need to
- // stall until it's finished.
- const auto& last_frame = frames.back();
- assert(std::size(last_frame.submissions));
- const auto& last_frame_submission = last_frame.submissions.back();
- last_frame_submission->end_handle->get_time_spinlock();
-
- // From our sleep in present implementation, just spinning until
- // the previous frame has completed did not work well. This was because
- // there was a delay between presentation and when new work was given
- // to the GPU. If we stalled the CPU without trying to account for this, we
- // would get huge frame drops, loss of throughput, and the GPU would even
- // clock down. So naturally I am concerned about this approach, but it seems
- // to perform well so far in my own testing and is just beautifully elegant.
-}
-*/
-
void DeviceContext::update_params(
const std::optional<VkSwapchainKHR> target,
const std::chrono::milliseconds& present_delay,
diff --git a/src/device_context.hh b/src/device_context.hh
index 172801c..0e0a4eb 100644
--- a/src/device_context.hh
+++ b/src/device_context.hh
@@ -24,7 +24,8 @@ class DeviceContext final : public Context {
InstanceContext& instance;
PhysicalDeviceContext& physical_device;
- // Whether or not we were asked to do NV_VK_LowLatency2 or VK_AMD_anti_lag.
+ // Whether or not we were asked to do VK_NV_low_latency2 or VK_AMD_anti_lag
+ // at the device level.
const bool was_capability_requested;
const VkDevice device;
@@ -39,13 +40,13 @@ class DeviceContext final : public Context {
public:
DeviceContext(InstanceContext& parent_instance,
PhysicalDeviceContext& parent_physical,
- const VkDevice& device, const bool was_antilag_requested,
+ const VkDevice& device, const bool was_capability_requested,
VkuDeviceDispatchTable&& vtable);
virtual ~DeviceContext();
public:
- // Updates the settings associated with that swapchain. If none is provided
- // all swapchains are set to this value.
+ // Updates the settings associated with that swapchain. If no swapchain
+ // target is provided all swapchains are set to this value.
void update_params(const std::optional<VkSwapchainKHR> target,
const std::chrono::milliseconds& present_delay,
const bool was_low_latency_requested);
diff --git a/src/helper.hh b/src/helper.hh
index 468f146..6dde9be 100644
--- a/src/helper.hh
+++ b/src/helper.hh
@@ -8,6 +8,11 @@
namespace low_latency {
+#define THROW_NOT_VKSUCCESS(x) \
+ if (const auto result = x; result != VK_SUCCESS) { \
+ throw result; \
+ }
+
// Small templates which allow us to SFINAE find pNext structs.
template <typename T>
static T* find_next(void* const head, const VkStructureType& stype) {
diff --git a/src/layer.cc b/src/layer.cc
index 7a7ffc8..813c267 100644
--- a/src/layer.cc
+++ b/src/layer.cc
@@ -78,8 +78,6 @@ CreateInstance(const VkInstanceCreateInfo* pCreateInfo,
INSTANCE_VTABLE_LOAD(GetPhysicalDeviceQueueFamilyProperties2);
INSTANCE_VTABLE_LOAD(GetPhysicalDeviceFeatures2);
INSTANCE_VTABLE_LOAD(GetPhysicalDeviceSurfaceCapabilities2KHR);
- INSTANCE_VTABLE_LOAD(GetPhysicalDeviceProperties2);
- INSTANCE_VTABLE_LOAD(GetPhysicalDeviceProperties2KHR);
#undef INSTANCE_VTABLE_LOAD
const auto lock = std::scoped_lock{layer_context.mutex};
@@ -102,11 +100,13 @@ DestroyInstance(VkInstance instance, const VkAllocationCallbacks* allocator) {
// Erase our physical devices owned by this instance from the global
// context.
for (const auto& [key, _] : context->phys_devices) {
- assert(layer_context.contexts.erase(key));
+ assert(layer_context.contexts.contains(key));
+ layer_context.contexts.erase(key);
}
const auto key = layer_context.get_key(instance);
- assert(layer_context.contexts.erase(key));
+ assert(layer_context.contexts.contains(key));
+ layer_context.contexts.erase(key);
// Should be the last ptr now like DestroyDevice.
assert(context.unique());
@@ -154,23 +154,6 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
const auto requested = std::unordered_set<std::string_view>(
std::from_range, enabled_extensions);
- // There's the antilag extension that might be requested here - Antilag2.
- // Then there's the other thing we provide, which is our AntiLag1
- // equivalent. Calling them AL1 and AL2, where AL1 is requested via
- // an env var and AL2 is requested at the device level via the extension,
- // the cases where we exit with a bad code or deliberately no-op are:
- //
- // !SUPPORTED && !AL2 && AL1 -> No-op hooks
- // !AL2 && !AL1 -> No-op hooks.
- // !SUPPORTED && AL2 -> VK_ERROR_INITIALIZATION_FAILED
- //
- // Note that even though the user has explicitly enabled AL1 via an env var,
- // failing hard here by returning INIT_FAILED if the device doesn't support
- // it is wrong. The vulkan application could just be creating a device that
- // cannot support it which is unrelated to anything present related. This
- // is not the case with AL2, because the vulkan application has to
- // explicitly ask for the extension when it creates the device.
-
const auto was_capability_requested =
requested.contains(VK_AMD_ANTI_LAG_EXTENSION_NAME) ||
requested.contains(VK_NV_LOW_LATENCY_2_EXTENSION_NAME);
@@ -204,11 +187,9 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
// Only append the extra extension if it wasn't already asked for.
for (const auto& wanted : PhysicalDeviceContext::required_extensions) {
- if (requested.contains(wanted)) {
- continue;
+ if (!requested.contains(wanted)) {
+ next_extensions.push_back(wanted);
}
-
- next_extensions.push_back(wanted);
}
return next_extensions;
@@ -284,14 +265,16 @@ DestroyDevice(VkDevice device, const VkAllocationCallbacks* allocator) {
// Remove all owned queues from our global context pool.
for (const auto& [queue, _] : device_context->queues) {
const auto key = layer_context.get_key(queue);
- assert(layer_context.contexts.erase(key));
+ assert(layer_context.contexts.contains(key));
+ layer_context.contexts.erase(key);
}
const auto key = layer_context.get_key(device);
- assert(layer_context.contexts.erase(key));
+ assert(layer_context.contexts.contains(key));
+ layer_context.contexts.erase(key);
- // should be the last shared ptr now, so its destructor can be called.
- // the destructor should expect its owned queues to be unique as well!
+ // Should be the last shared ptr now, so its destructor can be called.
+ // The destructor should expect its owned queues to be unique as well.
assert(device_context.unique());
return func;
@@ -361,7 +344,7 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count,
const VkSubmitInfo* submit_infos, VkFence fence) {
const auto context = layer_context.get_context(queue);
- const auto& vtable = context->device_context.vtable;
+ const auto& vtable = context->device.vtable;
if (!submit_count || !context->should_inject_timestamps()) {
return vtable.QueueSubmit(queue, submit_count, submit_infos, fence);
@@ -447,7 +430,7 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count,
const VkSubmitInfo2* submit_infos, VkFence fence) {
const auto context = layer_context.get_context(queue);
- const auto& vtable = context->device_context.vtable;
+ const auto& vtable = context->device.vtable;
if (!submit_count || !context->should_inject_timestamps()) {
return vtable.QueueSubmit2(queue, submit_count, submit_infos, fence);
@@ -511,7 +494,7 @@ static VKAPI_ATTR VkResult VKAPI_CALL
vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* present_info) {
const auto context = layer_context.get_context(queue);
- const auto& vtable = context->device_context.vtable;
+ const auto& vtable = context->device.vtable;
if (const auto res = vtable.QueuePresentKHR(queue, present_info);
res != VK_SUCCESS) {
@@ -524,7 +507,11 @@ vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* present_info) {
for (auto i = std::uint32_t{0}; i < present_info->swapchainCount; ++i) {
const auto& swapchain = present_info->pSwapchains[i];
+
+ // For VK_AMD_anti_lag, providing a pPresentId isn't part of the spec.
+ // So we just set it to 0 if it isn't provided.
const auto present_id = pid ? pid->pPresentIds[i] : 0;
+
context->notify_present(swapchain, present_id);
}
@@ -549,9 +536,9 @@ static VKAPI_ATTR VkResult VKAPI_CALL EnumerateDeviceExtensionProperties(
physical_device, pLayerName, pPropertyCount, pProperties);
}
- // If we're spoofing nvidia we want to provide their extension instead.
+ // If we're exposing reflex we want to provide their extension instead.
const auto extension_properties = [&]() -> VkExtensionProperties {
- if (context->instance.layer.should_spoof_nvidia) {
+ if (context->instance.layer.should_expose_reflex) {
return {.extensionName = VK_NV_LOW_LATENCY_2_EXTENSION_NAME,
.specVersion = VK_NV_LOW_LATENCY_2_SPEC_VERSION};
}
@@ -561,13 +548,12 @@ static VKAPI_ATTR VkResult VKAPI_CALL EnumerateDeviceExtensionProperties(
if (pLayerName) {
// This query is for our layer specifically.
-
- if (!pProperties) { // Querying how much space they need.
+ if (!pProperties) {
*pPropertyCount = 1;
return VK_SUCCESS;
}
- if (!*pPropertyCount) { // They gave us zero space to work with.
+ if (!*pPropertyCount) {
return VK_INCOMPLETE;
}
@@ -618,8 +604,7 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2(
vtable.GetPhysicalDeviceFeatures2(physical_device, pFeatures);
// We're going to use this feature for both VK_AMD_anti_lag and
- // VK_NV_low_latency2. It simplifies things a bit if we share a code path
- // for now. TODO remove it in the future for VK_AMD_anti_lag.
+ // VK_NV_low_latency2. It simplifies things a bit if we share a code path.
if (const auto pidf = find_next<VkPhysicalDevicePresentIdFeaturesKHR>(
pFeatures,
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENT_ID_FEATURES_KHR);
@@ -628,10 +613,10 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2(
pidf->presentId = true;
}
- // Don't provide AntiLag if we're trying to spoof nvidia.
- // Nvidia uses VkSurfaceCapabilities2KHR to determine if a surface
- // is capable of reflex instead of AMD's physical device switch found here.
- if (context->instance.layer.should_spoof_nvidia) {
+ // Don't provide AntiLag if we're exposing reflex - VK_NV_low_latency2 uses
+ // VkSurfaceCapabilities2KHR to determine if a surface is capable of reflex
+ // instead of AMD's physical device switch found here.
+ if (context->instance.layer.should_expose_reflex) {
return;
}
@@ -649,29 +634,6 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceFeatures2KHR(
return GetPhysicalDeviceFeatures2(physical_device, pFeatures);
}
-static VKAPI_ATTR void VKAPI_CALL
-GetPhysicalDeviceProperties2(VkPhysicalDevice physical_device,
- VkPhysicalDeviceProperties2* pProperties) {
-
- const auto context = layer_context.get_context(physical_device);
- const auto& vtable = context->instance.vtable;
-
- vtable.GetPhysicalDeviceProperties2(physical_device, pProperties);
-
- constexpr auto NVIDIA_VENDOR_ID = 0x10DE;
- constexpr auto NVIDIA_DEVICE_ID = 0x2684; // rtx 4080 i think?
- if (context->instance.layer.should_spoof_nvidia) {
- pProperties->properties.vendorID = NVIDIA_VENDOR_ID;
- pProperties->properties.deviceID = NVIDIA_DEVICE_ID;
- }
-}
-
-static VKAPI_ATTR void VKAPI_CALL
-GetPhysicalDeviceProperties2KHR(VkPhysicalDevice physical_device,
- VkPhysicalDeviceProperties2* pProperties) {
- return GetPhysicalDeviceProperties2(physical_device, pProperties);
-}
-
static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceSurfaceCapabilities2KHR(
VkPhysicalDevice physical_device,
const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo,
@@ -684,7 +646,7 @@ static VKAPI_ATTR void VKAPI_CALL GetPhysicalDeviceSurfaceCapabilities2KHR(
physical_device, pSurfaceInfo, pSurfaceCapabilities);
// Don't do this unless we're exposing reflex.
- if (!context->instance.layer.should_spoof_nvidia) {
+ if (!context->instance.layer.should_expose_reflex) {
return;
}
@@ -742,8 +704,10 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateSwapchainKHR(
was_low_latency_requested = slci->latencyModeEnable;
}
- context->swapchain_monitors.try_emplace(*pSwapchain, *context,
- was_low_latency_requested);
+
+ const auto [_, did_emplace] = context->swapchain_monitors.try_emplace(
+ *pSwapchain, *context, was_low_latency_requested);
+ assert(did_emplace);
return VK_SUCCESS;
}
@@ -753,7 +717,8 @@ DestroySwapchainKHR(VkDevice device, VkSwapchainKHR swapchain,
const VkAllocationCallbacks* pAllocator) {
const auto context = layer_context.get_context(device);
- assert(context->swapchain_monitors.erase(swapchain));
+ assert(context->swapchain_monitors.contains(swapchain));
+ context->swapchain_monitors.erase(swapchain);
context->vtable.DestroySwapchainKHR(device, swapchain, pAllocator);
}
@@ -765,9 +730,8 @@ AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD* pData) {
// AL2 is synchronous while NVIDIA's low_latency2 is asynchronous.
// It's difficult to model an asynchronous impl inside a synchronous impl,
- // but it's easy to do the inverse. As a result, we should implement
- // NVIDIA's method and then have a working AL2 implementation follow using
- // that existing code path.
+ // but it's easy to do the inverse. AMD's extension piggybacks on NVIDIA's
+ // more complicated implementation.
const auto present_delay = [&]() {
using namespace std::chrono;
@@ -777,12 +741,18 @@ AntiLagUpdateAMD(VkDevice device, const VkAntiLagDataAMD* pData) {
context->update_params(std::nullopt, present_delay,
(pData->mode == VK_ANTI_LAG_MODE_ON_AMD));
- if (!pData->pPresentationInfo) {
+ if (!pData->pPresentationInfo ||
+ pData->pPresentationInfo->stage != VK_ANTI_LAG_STAGE_INPUT_AMD) {
+
return;
}
- if (pData->pPresentationInfo->stage == VK_ANTI_LAG_STAGE_INPUT_AMD) {
- // TODO use nvidia's path
+ // VK_AMD_anti_lag doesn't provide a swapchain, so we can't map it to
+ // a queue. Our previous implementation used the last queue that presented
+ // and made sure that at least that one completed. I think it's more robust
+ // to make sure they all complete.
+ for (auto& iter : context->swapchain_monitors) {
+ iter.second.wait_until();
}
}
@@ -832,21 +802,18 @@ VkResult SetLatencySleepModeNV(VkDevice device, VkSwapchainKHR swapchain,
// If pSleepModeInfo is nullptr, it means no delay and no low latency.
context->update_params(swapchain, std::chrono::milliseconds{0}, false);
}
+
return VK_SUCCESS;
}
void SetLatencyMarkerNV(VkDevice device, VkSwapchainKHR swapchain,
const VkSetLatencyMarkerInfoNV* pLatencyMarkerInfo) {
// STUB
- // We will probably end up making use of this in the future, but afaict it's
- // not relevant for this layer's operation just yet. This function is
- // NVIDIA's way of giving developers insight into their render pipeline.
}
void GetLatencyTimingsNV(VkDevice device, VkSwapchainKHR swapchain,
VkGetLatencyMarkerInfoNV* pLatencyMarkerInfo) {
// STUB
- // Just like SetLatencyMarkerNV this isn't relevant for us just yet.
}
} // namespace low_latency
@@ -907,11 +874,6 @@ static const auto instance_functions = func_map_t{
HOOK_ENTRY("vkGetPhysicalDeviceSurfaceCapabilities2KHR",
low_latency::GetPhysicalDeviceSurfaceCapabilities2KHR),
-
- HOOK_ENTRY("vkGetPhysicalDeviceProperties2",
- low_latency::GetPhysicalDeviceProperties2),
- HOOK_ENTRY("vkGetPhysicalDeviceProperties2KHR",
- low_latency::GetPhysicalDeviceProperties2KHR),
};
static const auto device_functions = func_map_t{
diff --git a/src/layer_context.cc b/src/layer_context.cc
index 4699202..4399338 100644
--- a/src/layer_context.cc
+++ b/src/layer_context.cc
@@ -4,15 +4,14 @@
#include <string_view>
namespace low_latency {
-
+
LayerContext::LayerContext() {
const auto parse_bool_env = [](const auto& name) -> bool {
const auto env = std::getenv(name);
return env && std::string_view{env} == "1";
};
-
- this->is_antilag_1_enabled = parse_bool_env(SLEEP_AFTER_PRESENT_ENV);
- this->should_spoof_nvidia = parse_bool_env(SPOOF_NVIDIA_ENV);
+
+ this->should_expose_reflex = parse_bool_env(EXPOSE_REFLEX_ENV);
}
LayerContext::~LayerContext() {}
diff --git a/src/layer_context.hh b/src/layer_context.hh
index 95f1cd5..731b273 100644
--- a/src/layer_context.hh
+++ b/src/layer_context.hh
@@ -48,22 +48,15 @@ using dispatch_context_t = typename context_for_t<D>::context;
class LayerContext final : public Context {
private:
- // If this is not null and set to exactly 1, then we should sleep after
- // present.
- static constexpr auto SLEEP_AFTER_PRESENT_ENV =
- "LOW_LATENCY_LAYER_SLEEP_AFTER_PRESENT";
-
- // If this is not null and set to exactly 1, then VK_NV_low_latency2
- // should be provided instead of VK_AMD_anti_lag.
- static constexpr auto SPOOF_NVIDIA_ENV =
- "LOW_LATENCY_LAYER_SPOOF_NVIDIA";
+ // If this is not null and set to 1 then VK_NV_low_latency2 should be
+ // provided instead of VK_AMD_anti_lag.
+ static constexpr auto EXPOSE_REFLEX_ENV = "LOW_LATENCY_LAYER_EXPOSE_REFLEX";
public:
std::mutex mutex;
std::unordered_map<void*, std::shared_ptr<Context>> contexts;
- bool is_antilag_1_enabled = false;
- bool should_spoof_nvidia = false;
+ bool should_expose_reflex = false;
public:
LayerContext();
diff --git a/src/physical_device_context.cc b/src/physical_device_context.cc
index 9c4ad8e..86bf9ab 100644
--- a/src/physical_device_context.cc
+++ b/src/physical_device_context.cc
@@ -1,4 +1,5 @@
#include "physical_device_context.hh"
+#include "helper.hh"
#include <vulkan/vulkan_core.h>
@@ -26,25 +27,24 @@ PhysicalDeviceContext::PhysicalDeviceContext(
vtable.GetPhysicalDeviceQueueFamilyProperties2(physical_device, &count,
nullptr);
- using qp_t = PhysicalDeviceContext::queue_properties_t;
- auto result = qp_t(
+ auto result = std::vector<VkQueueFamilyProperties2>(
count, VkQueueFamilyProperties2{
.sType = VK_STRUCTURE_TYPE_QUEUE_FAMILY_PROPERTIES_2});
vtable.GetPhysicalDeviceQueueFamilyProperties2(physical_device, &count,
std::data(result));
- return std::make_unique<qp_t>(std::move(result));
+ return std::make_unique<std::vector<VkQueueFamilyProperties2>>(
+ std::move(result));
}();
this->supports_required_extensions = [&]() {
auto count = std::uint32_t{};
- THROW_NON_VKSUCCESS(vtable.EnumerateDeviceExtensionProperties(
+ THROW_NOT_VKSUCCESS(vtable.EnumerateDeviceExtensionProperties(
physical_device, nullptr, &count, nullptr));
auto supported_extensions = std::vector<VkExtensionProperties>(count);
- THROW_NON_VKSUCCESS(vtable.EnumerateDeviceExtensionProperties(
- physical_device, nullptr, &count,
- std::data(supported_extensions)));
+ THROW_NOT_VKSUCCESS(vtable.EnumerateDeviceExtensionProperties(
+ physical_device, nullptr, &count, std::data(supported_extensions)));
const auto supported =
supported_extensions |
diff --git a/src/physical_device_context.hh b/src/physical_device_context.hh
index f7ad289..d2e094e 100644
--- a/src/physical_device_context.hh
+++ b/src/physical_device_context.hh
@@ -23,16 +23,12 @@ class PhysicalDeviceContext final : public Context {
public:
InstanceContext& instance;
-
const VkPhysicalDevice physical_device;
- std::unique_ptr<const VkPhysicalDeviceProperties> properties;
-
- using queue_properties_t = std::vector<VkQueueFamilyProperties2>;
- std::unique_ptr<const queue_properties_t> queue_properties;
+ std::unique_ptr<VkPhysicalDeviceProperties> properties;
+ std::unique_ptr<std::vector<VkQueueFamilyProperties2>> queue_properties;
- // Will be true if the physical device supports everything in
- // this->required_extensions.
+ // Will be true if the physical device supports all of required_extensions.
bool supports_required_extensions = false;
public:
diff --git a/src/queue_context.cc b/src/queue_context.cc
index 30e73c1..1192bb6 100644
--- a/src/queue_context.cc
+++ b/src/queue_context.cc
@@ -1,44 +1,43 @@
#include "queue_context.hh"
#include "device_context.hh"
-#include "layer_context.hh"
+#include "helper.hh"
#include "timestamp_pool.hh"
#include <span>
+
#include <vulkan/vulkan_core.h>
namespace low_latency {
-QueueContext::CommandPoolOwner::CommandPoolOwner(
- const QueueContext& queue_context)
- : queue_context(queue_context) {
+QueueContext::CommandPoolOwner::CommandPoolOwner(const QueueContext& queue)
+ : queue(queue) {
- const auto& device_context = this->queue_context.device_context;
+ const auto& device_context = this->queue.device;
const auto cpci = VkCommandPoolCreateInfo{
.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
.flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT |
VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
- .queueFamilyIndex = queue_context.queue_family_index,
+ .queueFamilyIndex = queue.queue_family_index,
};
- THROW_NON_VKSUCCESS(device_context.vtable.CreateCommandPool(
+ THROW_NOT_VKSUCCESS(device_context.vtable.CreateCommandPool(
device_context.device, &cpci, nullptr, &this->command_pool));
}
QueueContext::CommandPoolOwner::~CommandPoolOwner() {
- const auto& device_context = this->queue_context.device_context;
+ const auto& device_context = this->queue.device;
device_context.vtable.DestroyCommandPool(device_context.device,
this->command_pool, nullptr);
}
-QueueContext::QueueContext(DeviceContext& device_context, const VkQueue& queue,
+QueueContext::QueueContext(DeviceContext& device, const VkQueue& queue,
const std::uint32_t& queue_family_index)
- : device_context(device_context), queue(queue),
- queue_family_index(queue_family_index),
+ : device(device), queue(queue), queue_family_index(queue_family_index),
command_pool(std::make_unique<CommandPoolOwner>(*this)) {
// Only construct a timestamp pool if we support it!
- if (device_context.physical_device.supports_required_extensions) {
+ if (device.physical_device.supports_required_extensions) {
this->timestamp_pool = std::make_unique<TimestampPool>(*this);
}
}
@@ -77,7 +76,6 @@ void QueueContext::notify_submit(
void QueueContext::notify_present(const VkSwapchainKHR& swapchain,
const present_id_t& present_id) {
-
// Notify the device that this swapchain was just presented to.
// We're avoiding a double hash here - don't use operator[] and erase.
auto iter = this->unpresented_submissions.try_emplace(present_id).first;
@@ -86,24 +84,28 @@ void QueueContext::notify_present(const VkSwapchainKHR& swapchain,
std::make_shared<std::deque<std::unique_ptr<Submission>>>();
}
- this->device_context.notify_present(swapchain, iter->second);
+ this->device.notify_present(swapchain, iter->second);
// Important, we nuke the submission because now it's presented.
this->unpresented_submissions.erase(iter);
}
bool QueueContext::should_inject_timestamps() const {
- const auto& physical_device = this->device_context.physical_device;
+ const auto& physical_device = this->device.physical_device;
+ // Our layer is a no-op here if we don't support it.
if (!physical_device.supports_required_extensions) {
return false;
}
// Don't bother injecting timestamps during queue submission if we
// aren't planning on doing anything anyway.
- if (!this->device_context.was_capability_requested &&
- !physical_device.instance.layer.is_antilag_1_enabled) {
+ if (!this->device.was_capability_requested) {
+ return false;
+ }
+ // Don't do it if we've been marked as 'out of band' by nvidia's extension.
+ if (this->should_ignore_latency) {
return false;
}
diff --git a/src/queue_context.hh b/src/queue_context.hh
index 48500e1..a52e718 100644
--- a/src/queue_context.hh
+++ b/src/queue_context.hh
@@ -22,18 +22,18 @@ class QueueContext final : public Context {
static constexpr auto MAX_TRACKED_SUBMISSIONS = 50u;
public:
- DeviceContext& device_context;
+ DeviceContext& device;
const VkQueue queue;
const std::uint32_t queue_family_index;
struct CommandPoolOwner final {
private:
- const QueueContext& queue_context;
+ const QueueContext& queue;
VkCommandPool command_pool;
public:
- CommandPoolOwner(const QueueContext& queue_context);
+ CommandPoolOwner(const QueueContext& queue);
CommandPoolOwner(const CommandPoolOwner&) = delete;
CommandPoolOwner(CommandPoolOwner&&) = delete;
CommandPoolOwner operator=(const CommandPoolOwner&) = delete;
@@ -67,7 +67,7 @@ class QueueContext final : public Context {
//
// When our hook sees a VkQueuePresentKHR, we take the provided present_id
// and notify our device that it needs to watch for when this completes.
- // We give it our submission. Now, it's out of our hands. We remove the
+ // We give it our submissions. Now, it's out of our hands. We remove the
// present_id_t mapping when doing so.
struct Submission {
std::shared_ptr<TimestampPool::Handle> head_handle, tail_handle;
diff --git a/src/swapchain_monitor.cc b/src/swapchain_monitor.cc
index 09fa8ba..adeb315 100644
--- a/src/swapchain_monitor.cc
+++ b/src/swapchain_monitor.cc
@@ -1,5 +1,6 @@
#include "swapchain_monitor.hh"
#include "device_context.hh"
+#include "helper.hh"
#include <vulkan/vulkan_core.h>
@@ -23,7 +24,7 @@ void SwapchainMonitor::WakeupSemaphore::signal(
VkSemaphoreSignalInfo{.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO,
.semaphore = this->timeline_semaphore,
.value = this->value};
- THROW_NON_VKSUCCESS(device.vtable.SignalSemaphore(device.device, &ssi));
+ THROW_NOT_VKSUCCESS(device.vtable.SignalSemaphore(device.device, &ssi));
}
void SwapchainMonitor::do_swapchain_monitor(const std::stop_token stoken) {
@@ -109,4 +110,19 @@ void SwapchainMonitor::notify_present(
this->cv.notify_one();
}
+void SwapchainMonitor::wait_until() {
+ // No lock needed: this is only reached from the synchronous
+ // VK_AMD_anti_lag path (AntiLagUpdateAMD), so there is no concurrent access.
+ if (this->in_flight_submissions.empty()) {
+ return;
+ }
+
+ const auto last_submissions = this->in_flight_submissions.back();
+ this->in_flight_submissions.clear();
+ if (last_submissions->empty()) {
+ return;
+ }
+
+ last_submissions->back()->tail_handle->await_time();
+}
+
} // namespace low_latency \ No newline at end of file
diff --git a/src/swapchain_monitor.hh b/src/swapchain_monitor.hh
index 5678630..be81d59 100644
--- a/src/swapchain_monitor.hh
+++ b/src/swapchain_monitor.hh
@@ -62,6 +62,10 @@ class SwapchainMonitor {
const std::uint64_t& value);
void notify_present(const QueueContext::submissions_t& submissions);
+
+ public:
+ // Synchronously wait until all in-flight submissions have completed.
+ void wait_until();
};
} // namespace low_latency
diff --git a/src/timestamp_pool.cc b/src/timestamp_pool.cc
index a37b2bc..4bb236b 100644
--- a/src/timestamp_pool.cc
+++ b/src/timestamp_pool.cc
@@ -1,7 +1,9 @@
#include "timestamp_pool.hh"
#include "device_context.hh"
+#include "helper.hh"
#include "queue_context.hh"
+#include <mutex>
#include <ranges>
#include <span>
#include <vulkan/utility/vk_dispatch_table.h>
@@ -13,18 +15,18 @@ TimestampPool::QueryChunk::QueryPoolOwner::QueryPoolOwner(
const QueueContext& queue_context)
: queue_context(queue_context) {
- const auto& device_context = this->queue_context.device_context;
+ const auto& device_context = this->queue_context.device;
const auto qpci =
VkQueryPoolCreateInfo{.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
.queryType = VK_QUERY_TYPE_TIMESTAMP,
.queryCount = QueryChunk::CHUNK_SIZE};
- THROW_NON_VKSUCCESS(device_context.vtable.CreateQueryPool(
+ THROW_NOT_VKSUCCESS(device_context.vtable.CreateQueryPool(
device_context.device, &qpci, nullptr, &this->query_pool));
}
TimestampPool::QueryChunk::QueryPoolOwner::~QueryPoolOwner() {
- const auto& device_context = this->queue_context.device_context;
+ const auto& device_context = this->queue_context.device;
device_context.vtable.DestroyQueryPool(device_context.device,
this->query_pool, nullptr);
}
@@ -43,7 +45,7 @@ TimestampPool::QueryChunk::CommandBuffersOwner::CommandBuffersOwner(
const QueueContext& queue_context)
: queue_context(queue_context), command_buffers(CHUNK_SIZE) {
- const auto& device_context = queue_context.device_context;
+ const auto& device_context = queue_context.device;
const auto cbai = VkCommandBufferAllocateInfo{
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
@@ -51,12 +53,12 @@ TimestampPool::QueryChunk::CommandBuffersOwner::CommandBuffersOwner(
.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
.commandBufferCount = CHUNK_SIZE,
};
- THROW_NON_VKSUCCESS(device_context.vtable.AllocateCommandBuffers(
+ THROW_NOT_VKSUCCESS(device_context.vtable.AllocateCommandBuffers(
device_context.device, &cbai, std::data(this->command_buffers)));
}
TimestampPool::QueryChunk::CommandBuffersOwner::~CommandBuffersOwner() {
- const auto& device_context = this->queue_context.device_context;
+ const auto& device_context = this->queue_context.device;
device_context.vtable.FreeCommandBuffers(
device_context.device, *this->queue_context.command_pool,
@@ -64,6 +66,13 @@ TimestampPool::QueryChunk::CommandBuffersOwner::~CommandBuffersOwner() {
std::data(this->command_buffers));
}
+VkCommandBuffer TimestampPool::QueryChunk::CommandBuffersOwner::operator[](
+ const std::size_t& i) {
+
+ assert(i < CHUNK_SIZE);
+ return this->command_buffers[i];
+}
+
TimestampPool::QueryChunk::~QueryChunk() {}
TimestampPool::TimestampPool(QueueContext& queue_context)
@@ -75,6 +84,7 @@ TimestampPool::TimestampPool(QueueContext& queue_context)
}
std::shared_ptr<TimestampPool::Handle> TimestampPool::acquire() {
+ const auto lock = std::scoped_lock{this->mutex};
// Gets the empty one, or inserts a new one and returns it.
const auto not_empty_iter = [this]() -> auto {
@@ -97,12 +107,12 @@ std::shared_ptr<TimestampPool::Handle> TimestampPool::acquire() {
// Grab any element from our set and erase it immediately after.
auto& indices = *(*not_empty_iter)->free_indices;
const auto query_index = *std::begin(indices);
- assert(indices.erase(query_index));
+ indices.erase(query_index);
return std::make_shared<Handle>(*this, *not_empty_iter, query_index);
}
-TimestampPool::Handle::Handle(const TimestampPool& timestamp_pool,
+TimestampPool::Handle::Handle(TimestampPool& timestamp_pool,
const std::shared_ptr<QueryChunk>& origin_chunk,
const std::uint64_t& query_index)
: timestamp_pool(timestamp_pool), origin_chunk(origin_chunk),
@@ -110,10 +120,12 @@ TimestampPool::Handle::Handle(const TimestampPool& timestamp_pool,
command_buffer((*origin_chunk->command_buffers)[query_index]) {}
TimestampPool::Handle::~Handle() {
+ const auto lock = std::scoped_lock{this->timestamp_pool.mutex};
+
// Parent destructing shouldn't mean we should have a bunch of
// insertions for zero reason.
if (const auto ptr = this->origin_chunk.lock(); ptr) {
- assert(ptr->free_indices->insert(this->query_index).second);
+ ptr->free_indices->insert(this->query_index);
}
}
@@ -124,32 +136,32 @@ void TimestampPool::Handle::setup_command_buffers(
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
};
- const auto& device_context = queue_context.device_context;
+ const auto& device_context = queue_context.device;
const auto& vtable = device_context.vtable;
vtable.ResetQueryPoolEXT(device_context.device, this->query_pool,
static_cast<std::uint32_t>(this->query_index), 1);
- THROW_NON_VKSUCCESS(vtable.ResetCommandBuffer(this->command_buffer, 0));
- THROW_NON_VKSUCCESS(vtable.BeginCommandBuffer(this->command_buffer, &cbbi));
+ THROW_NOT_VKSUCCESS(vtable.ResetCommandBuffer(this->command_buffer, 0));
+ THROW_NOT_VKSUCCESS(vtable.BeginCommandBuffer(this->command_buffer, &cbbi));
vtable.CmdWriteTimestamp2KHR(
this->command_buffer, VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT,
this->query_pool, static_cast<std::uint32_t>(this->query_index));
- THROW_NON_VKSUCCESS(vtable.EndCommandBuffer(this->command_buffer));
+ THROW_NOT_VKSUCCESS(vtable.EndCommandBuffer(this->command_buffer));
vtable.ResetQueryPoolEXT(device_context.device, tail.query_pool,
static_cast<std::uint32_t>(tail.query_index), 1);
- THROW_NON_VKSUCCESS(vtable.ResetCommandBuffer(tail.command_buffer, 0));
- THROW_NON_VKSUCCESS(vtable.BeginCommandBuffer(tail.command_buffer, &cbbi));
+ THROW_NOT_VKSUCCESS(vtable.ResetCommandBuffer(tail.command_buffer, 0));
+ THROW_NOT_VKSUCCESS(vtable.BeginCommandBuffer(tail.command_buffer, &cbbi));
vtable.CmdWriteTimestamp2KHR(
tail.command_buffer, VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT,
tail.query_pool, static_cast<std::uint32_t>(tail.query_index));
- THROW_NON_VKSUCCESS(vtable.EndCommandBuffer(tail.command_buffer));
+ THROW_NOT_VKSUCCESS(vtable.EndCommandBuffer(tail.command_buffer));
}
struct QueryResult {
@@ -157,7 +169,7 @@ struct QueryResult {
std::uint64_t available;
};
std::optional<DeviceClock::time_point_t> TimestampPool::Handle::get_time() {
- const auto& context = this->timestamp_pool.queue_context.device_context;
+ const auto& context = this->timestamp_pool.queue_context.device;
const auto& vtable = context.vtable;
auto query_result = QueryResult{};
@@ -180,7 +192,7 @@ std::optional<DeviceClock::time_point_t> TimestampPool::Handle::get_time() {
}
DeviceClock::time_point_t TimestampPool::Handle::await_time() {
- const auto& context = this->timestamp_pool.queue_context.device_context;
+ const auto& context = this->timestamp_pool.queue_context.device;
const auto& vtable = context.vtable;
struct QueryResult {
@@ -189,7 +201,7 @@ DeviceClock::time_point_t TimestampPool::Handle::await_time() {
};
auto query_result = QueryResult{};
- THROW_NON_VKSUCCESS(vtable.GetQueryPoolResults(
+ THROW_NOT_VKSUCCESS(vtable.GetQueryPoolResults(
context.device, query_pool,
static_cast<std::uint32_t>(this->query_index), 1, sizeof(query_result),
&query_result, sizeof(query_result),
diff --git a/src/timestamp_pool.hh b/src/timestamp_pool.hh
index 0d6c52d..d8ee359 100644
--- a/src/timestamp_pool.hh
+++ b/src/timestamp_pool.hh
@@ -3,7 +3,7 @@
// The purpose of this file is to provide the definition of a 'timestamp pool'.
// It manages blocks of timestamp query pools, hands them out when requested,
-// and allocates more when (if) we run out.
+// and allocates more when (if) we run out. It _should_ be thread safe.
// Usage:
// 1. Get handle with .acquire().
// 2. Write start/end timestamp operations with the handle's pool and index
@@ -15,6 +15,7 @@
#include <vulkan/vulkan.hpp>
#include <memory>
+#include <mutex>
#include <unordered_set>
#include <vector>
@@ -28,6 +29,7 @@ class DeviceContext;
class TimestampPool final {
private:
QueueContext& queue_context;
+ std::mutex mutex;
// A chunk of data which is useful for making timestamp queries.
// Allows association of an index to a query pool and command buffer.
@@ -72,10 +74,7 @@ class TimestampPool final {
~CommandBuffersOwner();
public:
- VkCommandBuffer operator[](const std::size_t& i) {
- assert(i < CHUNK_SIZE);
- return this->command_buffers[i];
- }
+ VkCommandBuffer operator[](const std::size_t& i);
};
std::unique_ptr<CommandBuffersOwner> command_buffers;
@@ -98,7 +97,7 @@ class TimestampPool final {
friend class TimestampPool;
private:
- const TimestampPool& timestamp_pool;
+ TimestampPool& timestamp_pool;
const std::weak_ptr<QueryChunk> origin_chunk;
public:
@@ -107,7 +106,7 @@ class TimestampPool final {
const VkCommandBuffer command_buffer;
public:
- Handle(const TimestampPool& timestamp_pool,
+ Handle(TimestampPool& timestamp_pool,
const std::shared_ptr<QueryChunk>& origin_chunk,
const std::uint64_t& query_index);
Handle(const Handle& handle) = delete;