aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/device_context.cc67
-rw-r--r--src/device_context.hh8
-rw-r--r--src/layer.cc38
-rw-r--r--src/queue_context.cc93
-rw-r--r--src/queue_context.hh29
-rw-r--r--src/timestamp_pool.cc8
6 files changed, 98 insertions, 145 deletions
diff --git a/src/device_context.cc b/src/device_context.cc
index 2214b71..b149311 100644
--- a/src/device_context.cc
+++ b/src/device_context.cc
@@ -1,7 +1,6 @@
#include "device_context.hh"
#include "queue_context.hh"
-#include <iostream>
#include <utility>
#include <vulkan/vulkan_core.h>
@@ -23,22 +22,6 @@ DeviceContext::~DeviceContext() {
}
}
-void DeviceContext::notify_acquire(const VkSwapchainKHR& swapchain,
- const std::uint32_t& image_index,
- const VkSemaphore& signal_semaphore) {
-
- /*
- std::cerr << "notify acquire for swapchain: " << swapchain << " : "
- << image_index << '\n';
- std::cerr << " signal semaphore: " << signal_semaphore << '\n';
- */
-
- const auto it = this->swapchain_signals.try_emplace(swapchain).first;
-
- // Doesn't matter if it was already there, overwrite it.
- it->second.insert_or_assign(image_index, signal_semaphore);
-}
-
DeviceContext::Clock::Clock(const DeviceContext& context) : device(context) {
this->calibrate();
}
@@ -92,17 +75,14 @@ DeviceContext::Clock::ticks_to_time(const std::uint64_t& ticks) const {
return time_point_t{delta};
}
-const auto debug_log_time2 = [](auto& stream, const auto& diff) {
- using namespace std::chrono;
- const auto ms = duration_cast<milliseconds>(diff);
- const auto us = duration_cast<microseconds>(diff - ms);
- const auto ns = duration_cast<nanoseconds>(diff - ms - us);
- stream << ms << " " << us << " " << ns << '\n';
-};
+void DeviceContext::notify_acquire(const VkSwapchainKHR& swapchain,
+ const std::uint32_t& image_index,
+ const VkSemaphore& signal_semaphore) {
+ const auto it = this->swapchain_signals.try_emplace(swapchain).first;
-const auto debug_log_time = [](const auto& diff) {
- debug_log_time2(std::cerr, diff);
-};
+ // Doesn't matter if it was already there, overwrite it.
+ it->second.insert_or_assign(image_index, signal_semaphore);
+}
void DeviceContext::sleep_in_input() {
// Present hasn't happened yet, we don't know what queue to attack.
@@ -110,26 +90,35 @@ void DeviceContext::sleep_in_input() {
return;
}
- const auto before = std::chrono::steady_clock::now();
+ const auto& frames = this->present_queue->in_flight_frames;
+ // No frame here means we're behind the GPU and do not need to delay.
+ // If anything we should speed up...
+ if (!std::size(frames)) {
+ return;
+ }
+
// If we're here, that means that there might be an outstanding frame that's
// sitting on our present_queue which hasn't yet completed, so we need to
// stall until it's finished.
- const auto& frames = this->present_queue->in_flight_frames;
- if (std::size(frames)) {
- frames.back().submissions.back()->end_handle->get_time_spinlock();
- }
- const auto after = std::chrono::steady_clock::now();
- //debug_log_time(after - before);
-
- // FIXME this should take into account 'cpu_time', which we currently do not...
- // idk if it matters.
+ const auto& last_frame = frames.back();
+ assert(std::size(last_frame.submissions));
+ const auto& last_frame_submission = frames.back().submissions.back();
+ last_frame_submission->end_handle->get_time_spinlock();
+
+ // From our sleep in present implementation, just spinning until
+ // the previous frame has completed did not work well. This was because
+ // there was a delay between presentation and when new work was given
+ // to the GPU. If we stalled the CPU without trying to account for this, we
+ // would get huge frame drops, loss of throughput, and the GPU would even
+ // clock down. So naturally I am concerned about this approach, but it seems
+ // to perform well so far in my own testing and is just beautifully elegant.
}
void DeviceContext::notify_antilag_update(const VkAntiLagDataAMD& data) {
this->antilag_mode = data.mode;
- this->antilag_fps = data.maxFPS;
+ this->antilag_fps = data.maxFPS; // TODO
- // This might not be provided (probably just to set some settings).
+ // This might not be provided (probably just to set some settings?).
if (!data.pPresentationInfo) {
return;
}
diff --git a/src/device_context.hh b/src/device_context.hh
index c73f97f..37817d5 100644
--- a/src/device_context.hh
+++ b/src/device_context.hh
@@ -2,7 +2,6 @@
#define DEVICE_CONTEXT_HH_
#include <chrono>
-#include <deque>
#include <memory>
#include <unordered_map>
@@ -30,6 +29,8 @@ struct DeviceContext final : public Context {
std::unordered_map<VkQueue, std::shared_ptr<QueueContext>> queues;
// We map swapchains to image indexes and their last signalled semaphore.
+ // FIXME: This isn't used right now, it was formerly used to map queue
+ // submissions but it ended up being unnecessary complexity.
using index_semaphores_t = std::unordered_map<std::uint32_t, VkSemaphore>;
std::unordered_map<VkSwapchainKHR, index_semaphores_t> swapchain_signals;
@@ -54,7 +55,6 @@ struct DeviceContext final : public Context {
};
Clock clock;
-
std::uint32_t antilag_fps = 0;
VkAntiLagModeAMD antilag_mode = VK_ANTI_LAG_MODE_DRIVER_CONTROL_AMD;
@@ -75,9 +75,9 @@ struct DeviceContext final : public Context {
const std::uint32_t& image_index,
const VkSemaphore& signal_semaphore);
- //
+ //
void notify_antilag_update(const VkAntiLagDataAMD& data);
-
+
void notify_queue_present(const QueueContext& queue);
};
diff --git a/src/layer.cc b/src/layer.cc
index 12067a0..aea2154 100644
--- a/src/layer.cc
+++ b/src/layer.cc
@@ -1,6 +1,5 @@
#include "layer.hh"
-#include <iostream>
#include <span>
#include <string_view>
#include <unordered_map>
@@ -302,8 +301,6 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
DEVICE_VTABLE_LOAD(BeginCommandBuffer);
DEVICE_VTABLE_LOAD(EndCommandBuffer);
DEVICE_VTABLE_LOAD(ResetCommandBuffer);
- DEVICE_VTABLE_LOAD(CmdDraw);
- DEVICE_VTABLE_LOAD(CmdDrawIndexed);
DEVICE_VTABLE_LOAD(CmdResetQueryPool);
DEVICE_VTABLE_LOAD(GetDeviceQueue2);
DEVICE_VTABLE_LOAD(QueueSubmit2);
@@ -387,8 +384,7 @@ GetDeviceQueue(VkDevice device, std::uint32_t queue_family_index,
device_context->queues.emplace(*queue, ptr);
}
-// Identical logic to gdq so some amount of duplication, we can't assume gdq1 is
-// available apparently, what do I know?
+// Identical logic to gdq1.
static VKAPI_ATTR void VKAPI_CALL GetDeviceQueue2(
VkDevice device, const VkDeviceQueueInfo2* info, VkQueue* queue) {
@@ -454,11 +450,7 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count,
const auto& queue_context = layer_context.get_context(queue);
const auto& vtable = queue_context->device_context.vtable;
- if (!submit_count) { // no-op submit we shouldn't worry about
- return vtable.QueueSubmit(queue, submit_count, submit_infos, fence);
- }
-
- if (!queue_context->should_inject_timestamps()) {
+ if (!submit_count || !queue_context->should_inject_timestamps()) {
return vtable.QueueSubmit(queue, submit_count, submit_infos, fence);
}
@@ -476,17 +468,15 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count,
// alone.
// 2. Semaphores only signal at the end of their work, so we cannot use
// them as a mechanism to know if work has started without doing
- // another dummy submission. This adds complexity and also skews our
- // timestamps slightly.
- // 3. Semaphores can be waited which sounds nice in theory, but in my
- // own testing waiting on semaphores can cause scheduling issues and
- // cause wakeups as late as 1ms from when it was signalled, which is
- // unbelievably bad if we're trying to do frame pacing. This means
- // we are going to have to do a spinlock poll anyway.
- // 4. Guess what info we need? Timestamp information. Guess what
- // supports polling of an availability bit? Timestamp information.
- // Why bother with semaphores at all then? Polling a semaphore might
- // be faster, but the difference appears to be negligible.
+ // another dummy submission. This adds complexity and also might
+ // skew our timestamps slightly as they wouldn't be a part of the
+ // submission which contained those command buffers.
+ // 3. Timestamps support querying if their work has started/ended
+ // as long as we use the vkHostQueryReset extension to reset them
+ // before we consider them queryable. This means we don't need a
+ // 'is it valid to query' timeline semaphore.
+ // 4. The performance impact of using semaphores vs timestamps is
+ negligible.
using cbs_t = std::vector<VkCommandBuffer>;
auto next_submits = std::vector<VkSubmitInfo>{};
@@ -541,11 +531,7 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count,
const auto& queue_context = layer_context.get_context(queue);
const auto& vtable = queue_context->device_context.vtable;
- if (!submit_count) {
- return vtable.QueueSubmit2(queue, submit_count, submit_infos, fence);
- }
-
- if (!queue_context->should_inject_timestamps()) {
+ if (!submit_count || !queue_context->should_inject_timestamps()) {
return vtable.QueueSubmit2(queue, submit_count, submit_infos, fence);
}
diff --git a/src/queue_context.cc b/src/queue_context.cc
index 388019c..1f798de 100644
--- a/src/queue_context.cc
+++ b/src/queue_context.cc
@@ -12,31 +12,29 @@
namespace low_latency {
-static VkCommandPool
-make_command_pool(const DeviceContext& device_context,
- const std::uint32_t& queue_family_index) {
-
- const auto cpci = VkCommandPoolCreateInfo{
- .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
- .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT |
- VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
- .queueFamilyIndex = queue_family_index,
- };
-
- auto command_pool = VkCommandPool{};
- device_context.vtable.CreateCommandPool(device_context.device, &cpci,
- nullptr, &command_pool);
- return command_pool;
-}
-
QueueContext::QueueContext(DeviceContext& device_context, const VkQueue& queue,
const std::uint32_t& queue_family_index)
: device_context(device_context), queue(queue),
- queue_family_index(queue_family_index),
- // Important we make the command pool before the timestamp pool, because
- // it's a dependency.
- command_pool(make_command_pool(device_context, queue_family_index)),
- timestamp_pool(std::make_unique<TimestampPool>(*this)) {}
+ queue_family_index(queue_family_index) {
+
+ // Important we make the command pool before the timestamp pool, because
+ // it's a dependency.
+ this->command_pool = [&]() {
+ const auto cpci = VkCommandPoolCreateInfo{
+ .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+ .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT |
+ VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
+ .queueFamilyIndex = queue_family_index,
+ };
+
+ auto command_pool = VkCommandPool{};
+ device_context.vtable.CreateCommandPool(device_context.device, &cpci,
+ nullptr, &command_pool);
+ return command_pool;
+ }();
+
+ this->timestamp_pool = std::make_unique<TimestampPool>(*this);
+}
QueueContext::~QueueContext() {
@@ -63,27 +61,15 @@ void QueueContext::notify_submit(
std::span{info.pSignalSemaphores, info.signalSemaphoreCount},
std::inserter(signals, std::end(signals)));
- /*
- std::cerr << "submit1 notif for queue " << this->queue << '\n';
- std::cerr << " signals: \n";
- for (const auto& signal : signals) {
- std::cerr << " " << signal << '\n';
- }
- std::cerr << " waits: \n";
- for (const auto& wait : waits) {
- std::cerr << " " << wait << '\n';
- }
- */
-
this->submissions.emplace_back(std::make_unique<Submission>(
std::move(signals), std::move(waits), head_handle, tail_handle, now));
- // TODO HACK
- if (std::size(this->submissions) > 100) {
+ if (std::size(this->submissions) > this->MAX_TRACKED_SUBMISSIONS) {
this->submissions.pop_front();
}
}
+// Identical to notify_submit, but we use VkSubmitInfo2.
void QueueContext::notify_submit(
const VkSubmitInfo2& info,
const std::shared_ptr<TimestampPool::Handle> head_handle,
@@ -103,23 +89,10 @@ void QueueContext::notify_submit(
std::inserter(signals, std::end(signals)),
[](const auto& info) -> auto { return info.semaphore; });
- /*
- std::cerr << "submit2 notif for queue " << this->queue << '\n';
- std::cerr << " signals: \n";
- for (const auto& signal : signals) {
- std::cerr << " " << signal << '\n';
- }
- std::cerr << " waits: \n";
- for (const auto& wait : waits) {
- std::cerr << " " << wait << '\n';
- }
- */
-
this->submissions.emplace_back(std::make_unique<Submission>(
std::move(signals), std::move(waits), head_handle, tail_handle, now));
- // TODO HACK
- if (std::size(this->submissions) > 100) {
+ if (std::size(this->submissions) > this->MAX_TRACKED_SUBMISSIONS) {
this->submissions.pop_front();
}
}
@@ -139,7 +112,6 @@ void QueueContext::drain_submissions_to_frame() {
const auto start_iter = std::begin(this->submissions);
// no op submit?
if (start_iter == std::end(this->submissions)) {
- std::cerr << "ignored no op submit\n";
return;
}
const auto last_iter = std::prev(std::end(this->submissions));
@@ -208,20 +180,14 @@ void QueueContext::drain_frames_to_timings() {
return;
}
- // We used to collect all devices that were pointed to by all potential
- // submissions, put them in a set and then call.calibrate() on each once.
- // This is unnecessary now - we assume all submissions come from the same
- // queue. FIXME: don't assume this.
- auto& device_context = this->device_context;
- auto& clock = device_context.clock;
- clock.calibrate();
+ // Only need to calibrate this device, we don't support multi device anti
+ // lag.
+ this->device_context.clock.calibrate();
while (std::size(this->in_flight_frames)) {
const auto& frame = this->in_flight_frames.front();
- if (!std::size(frame.submissions)) {
- break;
- }
+ assert(std::size(frame.submissions));
const auto& last_submission = frame.submissions.back();
@@ -335,11 +301,8 @@ void QueueContext::sleep_in_present() {
const auto& device = this->device_context;
const auto& vtable = device.vtable;
- // Call this to push all in flight frames into our timings structure,
- // but only if they're completed. So now they are truly *in flight
- // frames*.
+ // After calling this, any remaining frames are truly *in flight*.
this->drain_frames_to_timings();
-
if (!std::size(this->in_flight_frames)) {
return;
}
diff --git a/src/queue_context.hh b/src/queue_context.hh
index 2a3ea39..fbb04e8 100644
--- a/src/queue_context.hh
+++ b/src/queue_context.hh
@@ -16,6 +16,21 @@
namespace low_latency {
class QueueContext final : public Context {
+ private:
+ // The amount of finished frame timing data we keep before eviction.
+ // For now, this value is also the number of data points used in the
+ // calculation of gpu timing information.
+ static constexpr auto MAX_TRACKED_TIMINGS = 50;
+ // The amount of queue submissions we allow tracked per queue before
+ // we give up tracking them. For a queue that is presented to,
+ // these submissions will be constantly moved to Frame structs so
+ // it's not an issue that we only track so many - unless it just
+ // happens that an application makes an unexpectedly large
+ // amount of vkQueueSubmit's per frame. For queues which don't
+ // present, this limit stops them from growing limitlessly in memory
+ // as we may not necessarily manually evict them yet.
+ static constexpr auto MAX_TRACKED_SUBMISSIONS = 50;
+
public:
DeviceContext& device_context;
@@ -27,8 +42,6 @@ class QueueContext final : public Context {
std::unique_ptr<TimestampPool> timestamp_pool;
public:
- static constexpr auto MAX_TRACKED_TIMINGS = 50;
-
// Potentially in flight queue submissions that come from this queue.
struct Submission {
const std::unordered_set<VkSemaphore> signals;
@@ -36,7 +49,7 @@ class QueueContext final : public Context {
const std::shared_ptr<TimestampPool::Handle> start_handle;
const std::shared_ptr<TimestampPool::Handle> end_handle;
-
+
const DeviceContext::Clock::time_point_t enqueued_time;
};
using submission_ptr_t = std::shared_ptr<Submission>;
@@ -50,8 +63,8 @@ class QueueContext final : public Context {
struct Frame {
std::deque<submission_ptr_t> submissions;
- // the point that control flow was returned from VkQueuePresentKHR back to the
- // application.
+ // the point that control flow was returned from VkQueuePresentKHR back
+ // to the application.
DeviceContext::Clock::time_point_t cpu_post_present_time;
};
std::deque<Frame> in_flight_frames;
@@ -67,11 +80,11 @@ class QueueContext final : public Context {
private:
// Drains submissions and promotes them into a single frame object.
void drain_submissions_to_frame();
-
+
// Drains in flight frames and promotes them into a Timing object if they
// have completed.
void drain_frames_to_timings();
-
+
// Antilag 1 equivalent where we sleep after present to reduce queueing.
void sleep_in_present();
@@ -92,7 +105,7 @@ class QueueContext final : public Context {
const DeviceContext::Clock::time_point_t& now);
void notify_present(const VkPresentInfoKHR& info);
-
+
public:
bool should_inject_timestamps() const;
};
diff --git a/src/timestamp_pool.cc b/src/timestamp_pool.cc
index 5149747..e8ef9f5 100644
--- a/src/timestamp_pool.cc
+++ b/src/timestamp_pool.cc
@@ -3,8 +3,8 @@
#include "queue_context.hh"
#include <chrono>
-#include <span>
#include <ranges>
+#include <span>
#include <vulkan/utility/vk_dispatch_table.h>
#include <vulkan/vulkan_core.h>
@@ -25,8 +25,10 @@ TimestampPool::QueryChunk::QueryChunk(const QueueContext& queue_context) {
return qp;
}();
- constexpr auto KEY_RANGE = std::views::iota(0u, QueryChunk::CHUNK_SIZE);
- this->free_indices = std::make_unique<free_indices_t>(std::from_range, KEY_RANGE);
+ this->free_indices = []() {
+ constexpr auto KEYS = std::views::iota(0u, QueryChunk::CHUNK_SIZE);
+ return std::make_unique<free_indices_t>(std::from_range, KEYS);
+ }();
this->command_buffers = [&, this]() -> auto {
auto cbs = std::make_unique<std::vector<VkCommandBuffer>>(CHUNK_SIZE);