aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/device_context.cc5
-rw-r--r--src/device_context.hh3
-rw-r--r--src/layer.cc147
-rw-r--r--src/queue_context.cc429
-rw-r--r--src/queue_context.hh36
5 files changed, 258 insertions, 362 deletions
diff --git a/src/device_context.cc b/src/device_context.cc
index f849df1..59d818e 100644
--- a/src/device_context.cc
+++ b/src/device_context.cc
@@ -2,6 +2,7 @@
#include "queue_context.hh"
#include <utility>
+#include <iostream>
namespace low_latency {
@@ -24,6 +25,9 @@ DeviceContext::~DeviceContext() {
void DeviceContext::notify_acquire(const VkSwapchainKHR& swapchain,
const std::uint32_t& image_index,
const VkSemaphore& signal_semaphore) {
+
+ std::cerr << "notify acquire for swapchain: " << swapchain << " : " << image_index << '\n';
+ std::cerr << " signal semaphore: " << signal_semaphore << '\n';
const auto it = this->swapchain_signals.try_emplace(swapchain).first;
@@ -57,6 +61,7 @@ void DeviceContext::Clock::calibrate() {
clock_gettime(CLOCK_MONOTONIC, &tv);
return tv.tv_nsec + tv.tv_sec*1000000000ull;
*/
+
const auto steady_before = std::chrono::steady_clock::now();
device.vtable.GetCalibratedTimestampsKHR(device.device, 2, std::data(infos),
&calibrated_result.device,
diff --git a/src/device_context.hh b/src/device_context.hh
index c08cec2..8a86cfb 100644
--- a/src/device_context.hh
+++ b/src/device_context.hh
@@ -36,7 +36,8 @@ struct DeviceContext final : public Context {
struct Clock {
public:
- using time_point_t = std::chrono::steady_clock::time_point;
+ using time_point_t = std::chrono::time_point<std::chrono::steady_clock,
+ std::chrono::nanoseconds>;
const DeviceContext& device;
public:
diff --git a/src/layer.cc b/src/layer.cc
index 1b1d9e7..f9917f6 100644
--- a/src/layer.cc
+++ b/src/layer.cc
@@ -1,5 +1,6 @@
#include "layer.hh"
+#include <iostream>
#include <memory>
#include <span>
#include <string_view>
@@ -278,7 +279,7 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(
return result;
}
-
+
#define DEVICE_VTABLE_LOAD(name) \
.name = reinterpret_cast<PFN_vk##name>(gdpa(*pDevice, "vk" #name))
auto vtable = VkuDeviceDispatchTable{
@@ -457,23 +458,19 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count,
}
// We have to avoid casting away the const* of the passed VkSubmitInfos.
- // We wrap every single submission with *two* extra VkSubmitInfos to
- // accomplish this. The first executes a command buffer that
+ // So we end up copying a lot of stuff and wrapping them in unique_ptrs
+ // so their position in memory is stable.
using cb_vect = std::vector<VkCommandBuffer>;
using tssi_ptr_t = std::unique_ptr<VkTimelineSemaphoreSubmitInfo>;
auto next_submits = std::vector<VkSubmitInfo>{};
auto next_cbs = std::vector<std::unique_ptr<cb_vect>>{};
- auto next_signals = std::vector<std::unique_ptr<std::uint64_t>>{};
- auto next_tssis = std::vector<tssi_ptr_t>{};
auto handles = std::vector<std::shared_ptr<TimestampPool::Handle>>{};
for (const auto& submit_info : std::span{submit_infos, submit_count}) {
const auto head_handle = queue_context->timestamp_pool->acquire();
const auto tail_handle = queue_context->timestamp_pool->acquire();
- // Head is special as we need to inject a CB into a copy of
- // their command buffers that records the time the waits completed.
next_cbs.emplace_back([&]() -> auto {
auto cbs = std::make_unique<std::vector<VkCommandBuffer>>();
head_handle->setup_command_buffers(*tail_handle, *queue_context);
@@ -488,42 +485,13 @@ vkQueueSubmit(VkQueue queue, std::uint32_t submit_count,
next_submits.back().pCommandBuffers = std::data(*next_cbs.back());
next_submits.back().commandBufferCount = std::size(*next_cbs.back());
- const auto next_signal = 1 + queue_context->semaphore_sequence++;
-
- next_signals.push_back(std::make_unique<std::uint64_t>(next_signal));
-
- next_tssis.push_back(std::make_unique<VkTimelineSemaphoreSubmitInfo>(
- VkTimelineSemaphoreSubmitInfo{
- .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR,
- .signalSemaphoreValueCount = 1,
- .pSignalSemaphoreValues = next_signals.back().get(),
- }));
- next_submits.push_back(VkSubmitInfo{
- .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
- .pNext = next_tssis.back().get(),
- .commandBufferCount = 1,
- .pCommandBuffers = &tail_handle->command_buffer,
- .signalSemaphoreCount = 1,
- .pSignalSemaphores = &queue_context->semaphore,
- });
-
- queue_context->notify_submit(submit_info, next_signal, head_handle,
- tail_handle);
-
+ queue_context->notify_submit(submit_info, head_handle, tail_handle);
handles.push_back(head_handle);
handles.push_back(tail_handle);
}
- if (const auto res = vtable.QueueSubmit(queue, std::size(next_submits),
- std::data(next_submits), fence);
- res != VK_SUCCESS) {
-
- return res;
- }
-
- // ?!?
-
- return VK_SUCCESS;
+ return vtable.QueueSubmit(queue, std::size(next_submits),
+ std::data(next_submits), fence);
}
// The logic for this function is identical to vkQueueSubmit.
@@ -531,71 +499,52 @@ static VKAPI_ATTR VkResult VKAPI_CALL
vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count,
const VkSubmitInfo2* submit_infos, VkFence fence) {
- const auto queue_context = layer_context.get_context(queue);
+ const auto& queue_context = layer_context.get_context(queue);
const auto& vtable = queue_context->device_context.vtable;
- // TODO
- if (!submit_count || true) {
+ if (!submit_count) {
return vtable.QueueSubmit2(queue, submit_count, submit_infos, fence);
}
- /*
- auto timestamp_handle = queue_context->timestamp_pool->acquire();
- timestamp_handle->setup_command_buffers(vtable);
- const auto& [head_cb, tail_cb] = timestamp_handle->command_buffers;
-
- const auto next_command_buffers = [&]() -> auto {
- auto next_command_buffers = std::vector<VkCommandBufferSubmitInfo>{};
- next_command_buffers.push_back(VkCommandBufferSubmitInfo{
- .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
- .commandBuffer = head_cb,
- });
- std::ranges::copy_n(submit_infos[0].pCommandBufferInfos,
- submit_infos[0].commandBufferInfoCount,
- std::back_inserter(next_command_buffers));
- return next_command_buffers;
- }();
+ using cb_vect_t = std::vector<VkCommandBufferSubmitInfo>;
+ auto next_submits = std::vector<VkSubmitInfo2>{};
+ auto next_cbs = std::vector<std::unique_ptr<cb_vect_t>>{};
+ auto handles = std::vector<std::shared_ptr<TimestampPool::Handle>>{};
- auto next_submit_infos = std::vector<VkSubmitInfo2>();
- std::ranges::copy_n(submit_infos, submit_count,
- std::back_inserter(next_submit_infos));
- next_submit_infos[0].pCommandBufferInfos = std::data(next_command_buffers);
- next_submit_infos[0].commandBufferInfoCount =
- std::size(next_command_buffers);
-
- const auto target_semaphore_sequence =
- 1 + queue_context->semaphore_sequence++;
- const auto tail_ssi = VkSemaphoreSubmitInfo{
- .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
- .semaphore = queue_context->semaphore,
- .value = target_semaphore_sequence,
- .stageMask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
- };
- const auto tail_cbsi = VkCommandBufferSubmitInfo{
- .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
- .commandBuffer = tail_cb,
- };
- next_submit_infos.push_back(VkSubmitInfo2{
- .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO_2,
- .commandBufferInfoCount = 1,
- .pCommandBufferInfos = &tail_cbsi,
- .signalSemaphoreInfoCount = 1,
- .pSignalSemaphoreInfos = &tail_ssi,
- });
-
- if (const auto res =
- vtable.QueueSubmit2(queue, std::size(next_submit_infos),
- std::data(next_submit_infos), fence);
- res != VK_SUCCESS) {
- return res;
- }
+ for (const auto& submit_info : std::span{submit_infos, submit_count}) {
+ const auto head_handle = queue_context->timestamp_pool->acquire();
+ const auto tail_handle = queue_context->timestamp_pool->acquire();
+
+ next_cbs.emplace_back([&]() -> auto {
+ auto cbs = std::make_unique<cb_vect_t>();
+ head_handle->setup_command_buffers(*tail_handle, *queue_context);
+ cbs->push_back(VkCommandBufferSubmitInfo{
+ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
+ .commandBuffer = head_handle->command_buffer,
+ });
+ std::ranges::copy_n(submit_info.pCommandBufferInfos,
+ submit_info.commandBufferInfoCount,
+ std::back_inserter(*cbs));
+ cbs->push_back(VkCommandBufferSubmitInfo{
+ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_SUBMIT_INFO,
+ .commandBuffer = tail_handle->command_buffer,
+ });
+ return cbs;
+ }());
- queue_context->notify_submit({submit_infos, submit_count},
- target_semaphore_sequence,
- std::move(timestamp_handle));
+ next_submits.push_back(submit_info);
+ next_submits.back().pCommandBufferInfos = std::data(*next_cbs.back());
+ next_submits.back().commandBufferInfoCount =
+ std::size(*next_cbs.back());
- return VK_SUCCESS;
- */
+ queue_context->notify_submit(submit_info, head_handle, tail_handle);
+
+ handles.push_back(head_handle);
+ handles.push_back(tail_handle);
+ }
+
+ return vtable.QueueSubmit2(queue, std::size(next_submits),
+ std::data(next_submits), fence);
}
static VKAPI_ATTR VkResult VKAPI_CALL
@@ -621,10 +570,10 @@ vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* present_info) {
queue_context->notify_present(*present_info);
}
- if (const auto sleep_time = queue_context->get_delay_time();
- sleep_time.has_value()) {
+ if (const auto sleep_until = queue_context->get_sleep_until();
+ sleep_until.has_value()) {
- std::this_thread::sleep_for(*sleep_time);
+ std::this_thread::sleep_until(*sleep_until);
}
return VK_SUCCESS;
diff --git a/src/queue_context.cc b/src/queue_context.cc
index 99cf51e..2b79b53 100644
--- a/src/queue_context.cc
+++ b/src/queue_context.cc
@@ -2,8 +2,10 @@
#include "device_context.hh"
#include "timestamp_pool.hh"
+#include <algorithm>
#include <chrono>
#include <iostream>
+#include <span>
namespace low_latency {
@@ -67,7 +69,7 @@ QueueContext::~QueueContext() {
}
void QueueContext::notify_submit(
- const VkSubmitInfo& info, const std::uint64_t& target_semaphore_sequence,
+ const VkSubmitInfo& info,
const std::shared_ptr<TimestampPool::Handle> head_handle,
const std::shared_ptr<TimestampPool::Handle> tail_handle) {
@@ -79,8 +81,7 @@ void QueueContext::notify_submit(
std::inserter(signals, std::end(signals)));
this->submissions.emplace_back(std::make_unique<Submission>(
- std::move(signals), std::move(waits), target_semaphore_sequence,
- head_handle, tail_handle));
+ std::move(signals), std::move(waits), head_handle, tail_handle));
// TODO HACK
if (std::size(this->submissions) > 100) {
@@ -88,40 +89,42 @@ void QueueContext::notify_submit(
}
}
-/*
+// VkSubmitInfo2 overload: records one submission on this queue.
+// Gathers the semaphores the submission waits on and signals into sets,
+// dumps them to stderr, and appends a Submission that keeps both sets
+// together with the head/tail timestamp handles.
void QueueContext::notify_submit(
- std::span<const VkSubmitInfo2> infos,
- const std::uint64_t target_semaphore_sequence,
- std::shared_ptr<TimestampPool::Handle>&& handle) {
+ const VkSubmitInfo2& info,
+ const std::shared_ptr<TimestampPool::Handle> head_handle,
+ const std::shared_ptr<TimestampPool::Handle> tail_handle) {
auto signals = std::unordered_set<VkSemaphore>{};
auto waits = std::unordered_set<VkSemaphore>{};
- for (const auto& info : infos) {
- constexpr auto get_semaphore = [](const auto& semaphore_info) {
- return semaphore_info.semaphore;
- };
- std::ranges::transform(info.pSignalSemaphoreInfos,
- std::next(info.pSignalSemaphoreInfos,
- info.signalSemaphoreInfoCount),
- std::inserter(signals, std::end(signals)),
- get_semaphore);
- std::ranges::transform(
- info.pWaitSemaphoreInfos,
- std::next(info.pWaitSemaphoreInfos, info.waitSemaphoreInfoCount),
- std::inserter(waits, std::end(waits)), get_semaphore);
+ // Pull the raw VkSemaphore out of every wait VkSemaphoreSubmitInfo.
+ // (The lambda parameter shadows the outer `info`.)
+ std::ranges::transform(
+ std::span{info.pWaitSemaphoreInfos, info.waitSemaphoreInfoCount},
+ std::inserter(waits, std::end(waits)),
+ [](const auto& info) -> auto { return info.semaphore; });
+
+ // Same extraction for the semaphores this submission signals.
+ std::ranges::transform(
+ std::span{info.pSignalSemaphoreInfos, info.signalSemaphoreInfoCount},
+ std::inserter(signals, std::end(signals)),
+ [](const auto& info) -> auto { return info.semaphore; });
+
+ // Debug trace of the collected semaphores.
+ std::cerr << "submit2 notif for queue " << this->queue << '\n';
+ std::cerr << " signals: \n";
+ for (const auto& signal : signals) {
+ std::cerr << " " << signal << '\n';
+ }
+ std::cerr << " waits: \n";
+ for (const auto& wait : waits) {
+ std::cerr << " " << wait << '\n';
}
this->submissions.emplace_back(std::make_unique<Submission>(
- std::move(signals), std::move(waits), target_semaphore_sequence,
- std::move(handle)));
+ std::move(signals), std::move(waits), head_handle, tail_handle));
// TODO HACK
+ // Drops the oldest entry once more than 100 submissions are tracked.
if (std::size(this->submissions) > 100) {
this->submissions.pop_front();
}
}
-*/
void QueueContext::notify_present(const VkPresentInfoKHR& info) {
@@ -153,7 +156,7 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) {
return collected_semaphores;
}();
- const auto start_iter = std::ranges::find_if(
+ const auto acquire_iter = std::ranges::find_if(
std::rbegin(this->submissions), std::rend(this->submissions),
[&](const auto& submission) {
return std::ranges::any_of(
@@ -162,13 +165,13 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) {
});
});
- if (start_iter == std::rend(this->submissions)) {
- std::cout << "couldn't find starting submission!\n";
+ if (acquire_iter == std::rend(this->submissions)) {
+ std::cerr << "couldn't find starting submission!\n";
return;
}
- const auto& start = *start_iter;
+ const auto& acquire = *acquire_iter;
- const auto end_iter = std::ranges::find_if(
+ const auto present_iter = std::ranges::find_if(
std::rbegin(this->submissions), std::rend(this->submissions),
[&](const auto& submission) {
return std::ranges::any_of(
@@ -176,43 +179,61 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) {
[&](const auto& signal) { return waits.contains(signal); });
});
- if (end_iter == std::rend(this->submissions)) {
- std::cout << "couldn't find ending submission!\n";
+ if (present_iter == std::rend(this->submissions)) {
+ std::cerr << "couldn't find ending submission!\n";
return;
}
- const auto& end = *end_iter;
+ const auto& end = *present_iter;
+
+ std::cerr << "present for queue: " << queue << ", our waits:\n";
+ for (const auto& wait : waits) {
+ std::cerr << " " << wait << '\n';
+ }
+
+ // The work including and between acquire -> present is effectively
+ // guaranteed to contribute to our frame. We are going to mark this point
+ // for future queues to read the 'start of frame' from.
+ (*present_iter)->end_of_frame_marker = true;
+
+ // Now we read backwards to try to find our true start, starting at our
+ // acquire.
+ const auto start_iter = std::prev(std::ranges::find_if(
+ std::next(acquire_iter), std::rend(this->submissions),
+ [](const auto& submission) {
+ return submission->end_of_frame_marker;
+ }));
+ const auto& start = *start_iter;
+
+ // start iter can't be end cause it's prev'd.
auto frame = Frame{.start =
Frame::Timepoint{
.context = *this,
.handle = start->start_handle,
- .sequence = start->sequence,
},
.end = Frame::Timepoint{
.context = *this,
.handle = end->end_handle,
- .sequence = end->sequence,
}};
this->in_flight_frames.emplace_back(
std::make_unique<Frame>(std::move(frame)));
-
- // hack
- if (this->in_flight_frames.size() > 5) {
- this->in_flight_frames.pop_front();
- }
}
-std::optional<QueueContext::duration_t> QueueContext::get_delay_time() {
+// Debug helper: writes a duration to stderr as three space-separated
+// parts followed by a newline: whole milliseconds, the leftover whole
+// microseconds, and the leftover nanoseconds. Relies on the chrono stream
+// insertion operator for durations.
+const auto debug_log_time = [](const auto& diff) {
+ using namespace std::chrono;
+ const auto ms = duration_cast<milliseconds>(diff);
+ // Subtract the already-reported parts so each field is only a remainder.
+ const auto us = duration_cast<microseconds>(diff - ms);
+ const auto ns = duration_cast<nanoseconds>(diff - ms - us);
+ std::cerr << ms << " " << us << " " << ns << "\n";
+};
+
+void QueueContext::process_frames() {
if (!std::size(this->in_flight_frames)) {
- return std::nullopt;
+ return;
}
- // We are about to query the wait semaphores of all of our current
- // frames in flight. They may come from the same device, so we're going
- // to build a mapping here to reduce vulkan calls. Not only that,
- // we have to do this or else our timing information becomes broken
- // as this loop iterates.
- const auto target_devices = [this]() -> auto {
+ // Collect all devices and call calibrate.
+ [this]() -> auto {
using context_ref_t = std::reference_wrapper<DeviceContext>;
auto target_devices = std::unordered_map<VkDevice, context_ref_t>{};
for (const auto& frame : this->in_flight_frames) {
@@ -222,237 +243,139 @@ std::optional<QueueContext::duration_t> QueueContext::get_delay_time() {
target_devices.try_emplace(start.device, std::ref(start));
target_devices.try_emplace(end.device, std::ref(end));
}
- return target_devices;
- }();
-
- // Calibrate timestamps before we acquire semaphores.
- for (const auto& pair : target_devices) {
- auto& device = pair.second;
- device_context.clock.calibrate();
- }
-
- // Now we have all owned devices and their clocks are in a good state.
- // We need to build another mapping of semaphores to their queries now.
- const auto queue_sequences = [this]() -> auto {
- auto queue_sequences = std::unordered_map<VkQueue, std::uint64_t>{};
- for (const auto& frame : this->in_flight_frames) {
- auto& start = frame->start.context;
- auto& end = frame->end.context;
-
- for (const auto& queue_ptr : {&start, &end}) {
- if (queue_sequences.contains(queue_ptr->queue)) {
- continue;
- }
-
- const auto& vtable = queue_ptr->device_context.vtable;
- auto seq = std::uint64_t{};
- vtable.GetSemaphoreCounterValueKHR(this->device_context.device,
- this->semaphore, &seq);
- queue_sequences.emplace(queue_ptr->queue, seq);
- }
+ for (const auto& pair : target_devices) {
+ auto& device = pair.second.get();
+ device.clock.calibrate();
}
- return queue_sequences;
}();
- // Now all devices we are about to query are primed to query.
- // We have all sequence numbers from all queus we could possibly query.
- const auto S = std::size(this->in_flight_frames);
- for (auto i = std::size_t{0}; i < S; ++i) {
- assert(this->in_flight_frames[i]);
- const auto& frame = *this->in_flight_frames[i];
- const auto& start = frame.start;
- const auto& end = frame.end;
-
- std::cout << " Evaluating the frame that's " << S - i - 1
- << " behind\n";
-
- std::cout << " target start seq: " << start.sequence << '\n';
- std::cout << " target end seq: " << end.sequence << '\n';
-
- const auto start_seq_it = queue_sequences.find(start.context.queue);
- assert(start_seq_it != std::end(queue_sequences));
- const auto& start_seq = start_seq_it->second;
- if (start_seq < start.sequence) {
- std::cout << " frame hasn't started yet !\n ";
- continue;
- }
+ const auto get_tick_time = [](const auto& timepoint)
+ -> std::optional<DeviceContext::Clock::time_point_t> {
+ const auto& handle = timepoint.handle;
+ const auto& context = timepoint.context;
- /*
- const auto start_ticks_opt =
- start.handle->get_ticks(*start.context.timestamp_pool);
- if (!start_ticks_opt.has_value()) {
- std::cout << " frame hasn't started yet !\n ";
+ const auto ticks = handle->get_ticks(*context.timestamp_pool);
+ if (!ticks.has_value()) {
+ return std::nullopt;
}
+ const auto& clock = context.device_context.clock;
+ return clock.ticks_to_time(*ticks);
+ };
- std::cout << " START TICKS: " << start_ticks << '\n';
- const auto start_time =
- start.context.device_context.clock.ticks_to_time(start_ticks);
-
- {
- using namespace std::chrono;
- const auto diff = now - a;
- const auto ms = duration_cast<milliseconds>(diff);
- const auto us = duration_cast<microseconds>(diff - ms);
- const auto ns = duration_cast<nanoseconds>(diff - ms - us);
- std::cout << " frame started: " << ms << " ms " << us
- << " us " << ns << " ns ago\n";
- }
+ std::cerr << "starting frame readout\n";
+ while (std::size(this->in_flight_frames)) {
+ const auto& frame = this->in_flight_frames.front();
+ assert(frame);
- const auto end_seq_it = queue_sequences.find(end.context.queue);
- assert(end_seq_it != std::end(queue_sequences));
- const auto& end_seq = end_seq_it->second;
- if (start_seq < end.sequence) {
- std::cout << " frame hasn't started yet !\n ";
- continue;
+ const auto a = get_tick_time(frame->start);
+ if (!a.has_value()) {
+ break;
}
- */
- }
-
- return std::nullopt;
- //
-}
-// now it's all coming together
-// std::optional<QueueContext::duration_t> QueueContext::get_delay_time() {
-/*
-if (!std::size(this->in_flight_frames)) {
- return std::nullopt;
-}
-
-auto seq = std::uint64_t{};
-this->device_context.vtable.GetSemaphoreCounterValueKHR(
- this->device_context.device, this->semaphore, &seq);
-
-// Get semaphore first, then poll!
-this->timestamp_pool->poll();
-
-// idk how frequently we should call this.
-this->device_context.calibrate_timestamps();
+ const auto b = get_tick_time(frame->end);
+ if (!b.has_value()) {
+ break;
+ }
-static auto gpu_frametimes = std::deque<uint64_t>{};
-static auto cpu_frametimes = std::deque<uint64_t>{};
+ // assert(a <= b);
-const auto S = std::size(this->in_flight_frames);
+ //
+ const auto last_b =
+ this->timings.empty() ? *a : this->timings.back()->gpu_end;
-std::cout << "\nSTART FRAME READOUT\n";
-std::cout << "error bound: " << this->device_context.clock.error_bound
- << '\n';
-std::cout << "num frames in flight: " << S << '\n';
-std::cout << "from oldest -> newest\n";
+ // assert(last_b <= a);
-// const auto b_seq = semaphore_from_context(*this);
-const auto now = std::chrono::steady_clock::now();
+ const auto frametime = *b - last_b;
-auto i = std::size_t{0};
-for (; i < std::size(this->in_flight_frames); ++i) {
- const auto& frame = this->in_flight_frames[i];
- std::cout << " Evaluating the frame that's " << S - i - 1
- << " behind\n";
- if (!frame) {
- std::cout << " nullptr!\n";
- continue;
- }
+ std::cerr
+ << " calculated total time from last frame (frametime): ";
+ debug_log_time(*b - last_b);
- std::cout << " target start: " << frame->target_start_sequence <<
-'\n'; std::cout << " target end: " << frame->target_end_sequence << '\n'; if
-(seq < frame->target_start_sequence) { std::cout << " frame hasn't
-started yet!\n"; continue;
- }
+ this->timings.emplace_back(std::make_unique<Timing>(Timing{
+ .gpu_start = *a,
+ .gpu_end = *b,
+ .frametime = frametime,
+ }));
- const auto start_ticks =
- frame->start_context.timestamp_pool->get_polled(*frame->start);
- std::cout << " START TICKS: " << start_ticks << '\n';
- const auto& a_clock = frame->start_context.device_context.clock;
- const auto a = a_clock.ticks_to_time(start_ticks);
-
- {
- using namespace std::chrono;
- const auto diff = now - a;
- const auto ms = duration_cast<milliseconds>(diff);
- const auto us = duration_cast<microseconds>(diff - ms);
- const auto ns = duration_cast<nanoseconds>(diff - ms - us);
- std::cout << " frame started: " << ms << " ms " << us
- << " us " << ns << " ns ago\n";
- }
-
- if (seq < frame->target_end_sequence) {
- std::cout << " frame hasn't ended yet!\n";
- continue;
- }
-
-
- const auto end_ticks =
- frame->end_context.timestamp_pool->get_polled(*frame->end, true);
- const auto& b_clock = frame->end_context.device_context.clock;
- std::cout << " END_TICKS: " << end_ticks << '\n';
- const auto b = b_clock.ticks_to_time(end_ticks);
- {
- using namespace std::chrono;
- if (now <= b) {
- std::cout << "b happened before now?\n";
- }
- const auto diff = now - b;
- const auto ms = duration_cast<milliseconds>(diff);
- const auto us = duration_cast<microseconds>(diff - ms);
- const auto ns = duration_cast<nanoseconds>(diff - ms - us);
- std::cout << " frame ended: " << ms << " ms " << us
- << " us " << ns << " ns ago\n";
+ this->in_flight_frames.pop_front();
}
- const auto gpu_time = b - a;
- {
- using namespace std::chrono;
- const auto diff = gpu_time;
- const auto ms = duration_cast<milliseconds>(diff);
- const auto us = duration_cast<microseconds>(diff - ms);
- const auto ns = duration_cast<nanoseconds>(diff - ms - us);
- std::cout << " gpu_time: " << ms << " ms " << us
- << " us " << ns << " ns ago\n";
+ const auto MAX_TRACKED = 50;
+ if (std::size(this->timings) < MAX_TRACKED) {
+ return;
}
-
- /*
- cpu_frametimes.emplace_back(cpu_time);
- gpu_frametimes.emplace_back(gpu_time);
-}
-
-/*
-if (remove_index.has_value()) {
- this->in_flight_frames.erase(std::begin(this->in_flight_frames),
- std::begin(this->in_flight_frames) +
- *remove_index);
+ this->timings.erase(std::begin(this->timings),
+ std::next(std::begin(this->timings),
+ std::size(this->timings) - MAX_TRACKED));
}
-*/
-
-/*
-auto g_copy = gpu_frametimes;
-auto c_copy = cpu_frametimes;
-std::ranges::sort(g_copy);
-std::ranges::sort(c_copy);
-
-constexpr auto N = 49;
-if (std::size(cpu_frametimes) < N) {
- return std::nullopt;
-}
-
-const auto F = std::size(g_copy);
-// close enough to median lol
-const auto g = g_copy[F / 2];
-const auto c = c_copy[F / 2];
-std::cout << g << '\n';
+using opt_time_point_t = std::optional<DeviceContext::Clock::time_point_t>;
+
+// Estimates the time point the presenting thread should sleep until so
+// that the GPU has (approximately) drained the frames still in flight.
+//
+// Returns std::nullopt, meaning "do not sleep", when:
+//  * no frame is in flight after process_frames() has retired the
+//    completed ones, or
+//  * no frame has been timed yet, so there is no frametime history to
+//    extrapolate from.
+// Otherwise returns
+//   last gpu_end + median frametime * frames in flight + 2% slack,
+// which may lie in the past; sleeping until a past time point is a no-op.
+opt_time_point_t QueueContext::get_sleep_until() {
-std::cout << " median gpu: " << (g / 1'000'000) << " ms " << g / 1'000
- << " us " << g << " ns\n";
-std::cout << " median cpu: " << c / 1'000'000 << " ms " << c / 1'000
- << " us " << c << " ns\n";
+    // Call this to push all in flight frames into our timings structure,
+    // but only if they're completed. So now they are truly *in flight frames*.
+    this->process_frames();
+
+    // We have completed all frames. DO NOT WAIT!
+    if (!std::size(this->in_flight_frames)) {
+        return std::nullopt;
+    }
+
+    // Frames are in flight but none has completed yet, so there is no
+    // history. Indexing the empty median vector or calling back() on the
+    // empty deque below would be undefined behaviour, so bail out instead.
+    if (this->timings.empty()) {
+        return std::nullopt;
+    }
-if (F > N) {
- gpu_frametimes.pop_front();
- cpu_frametimes.pop_front();
-}
+    // Upper median of the recorded frametimes (sorted copy of pointers, so
+    // the history itself keeps its chronological order).
+    const auto median_frametime = [&, this]() {
+        auto vect = std::vector<Timing*>{};
+        std::ranges::transform(this->timings, std::back_inserter(vect),
+                               [](const auto& timing) { return timing.get(); });
+        std::ranges::sort(vect, [](const auto& a, const auto& b) {
+            return a->frametime < b->frametime;
+        });
+        return vect[std::size(vect) / 2]->frametime;
+    }();
-return std::nullopt;
+    //                                       PRESENT CALL
+    // | -------x----- | -------x--------------|
+    // ^ last_b        ^ a                     ^ b
+    //
+    // Us, the CPU on the host, is approximately at 'b'.
+    // We have a good guess for the distance between
+    // last_b and b (median_frametime).
+    // The GPU is at any point on this line (marked as x).
+    // Don't use A. It's less robust than just using last_b.
+    // It *might* be more accurate because it's closer,
+    // but there's an issue where there can sometimes be a very
+    // small distance between a and b because it is just the
+    // point in time when the vkAcquireSwapchainKHR signals
+    // the wait on the gpu queue, which can sometimes be tiny.
+
+    std::cerr << " median 100 frametimes: ";
+    debug_log_time(median_frametime);
+
+    // 2% of average gpu time for dealing with variance.
+    // This could be calculated more precisely with the
+    // numbers we have (like we could construct a high% confidence
+    // interval? not big on maths).
+    const auto slack = median_frametime / 50;
+
+    // If we're more than 1 frame queued, then we should wait for
+    // that to complete before returning. It's likely way better
+    // to sleep twice here and recompute between sleeps because we're
+    // extrapolating really far into the future here! TODO
+    //
+    // std::ssize keeps the multiplier signed; multiplying by the unsigned
+    // size() would turn the duration's representation unsigned.
+    const auto extra_delay =
+        median_frametime * (std::ssize(this->in_flight_frames) - 1);
+
+    const auto& last_b = this->timings.back()->gpu_end;
+
+    // All educated guesses:
+    // dist_to_b = frametime - dist_to_last_b;
+    // dist_to_last_b = now - last_b
+    // sleep_until = now + extra_delay + slack + dist_to_b
+    //             = now + extra_delay + slack + (frametime - dist_to_last_b)
+    //             = now + extra_delay + slack + frametime - (now - last_b)
+
+    const auto now = std::chrono::steady_clock::now();
+    assert(last_b <= now);
+    const auto dist = now - last_b;
+    // Even if this is negative, it's a no-op to sleep backwards.
+    return now + extra_delay + slack + median_frametime - dist;
}
-*/
} // namespace low_latency \ No newline at end of file
diff --git a/src/queue_context.hh b/src/queue_context.hh
index 3df6af4..6a71754 100644
--- a/src/queue_context.hh
+++ b/src/queue_context.hh
@@ -2,6 +2,7 @@
#define QUEUE_STATE_HH_
#include "context.hh"
+#include "device_context.hh"
#include "timestamp_pool.hh"
#include <vulkan/utility/vk_dispatch_table.h>
@@ -14,8 +15,6 @@
namespace low_latency {
-class DeviceContext;
-
class QueueContext final : public Context {
public:
DeviceContext& device_context;
@@ -23,6 +22,8 @@ class QueueContext final : public Context {
const VkQueue queue;
const std::uint32_t queue_family_index;
+ // I used to use these to signal when we could read timestamps until
+ // I realised you could use hostQueryReset.
std::uint64_t semaphore_sequence = 0;
VkSemaphore semaphore;
@@ -30,14 +31,17 @@ class QueueContext final : public Context {
std::unique_ptr<TimestampPool> timestamp_pool;
+ private:
+ static constexpr auto MAX_TRACKED_TIMINGS = 50;
// Potentially in flight queue submissions
struct Submission {
const std::unordered_set<VkSemaphore> signals;
const std::unordered_set<VkSemaphore> waits;
- const std::uint64_t sequence;
const std::shared_ptr<TimestampPool::Handle> start_handle;
const std::shared_ptr<TimestampPool::Handle> end_handle;
+
+ bool end_of_frame_marker = false;
};
std::deque<std::shared_ptr<Submission>> submissions;
@@ -48,7 +52,6 @@ class QueueContext final : public Context {
struct Timepoint {
const QueueContext& context;
const std::shared_ptr<TimestampPool::Handle> handle;
- const std::uint64_t sequence;
};
const Timepoint start;
@@ -56,6 +59,20 @@ class QueueContext final : public Context {
};
std::deque<std::unique_ptr<Frame>> in_flight_frames;
+ // Timing record for one completed frame, produced by process_frames()
+ // from the frame's start and end timestamp handles.
+ struct Timing {
+
+ // Converted tick times of the frame's start and end timepoints.
+ DeviceContext::Clock::time_point_t gpu_start;
+ DeviceContext::Clock::time_point_t gpu_end;
+
+ // Distance between the last gpu_end and this one.
+ // So one entire go around, including all cpu and gpu.
+ // For the first record (no previous gpu_end) process_frames() uses
+ // this frame's own gpu_start as the reference instead.
+ DeviceContext::Clock::time_point_t::duration frametime;
+ };
+ std::deque<std::unique_ptr<Timing>> timings;
+
+ private:
+ void process_frames();
+
public:
QueueContext(DeviceContext& device_context, const VkQueue& queue,
const std::uint32_t& queue_family_index);
@@ -63,18 +80,19 @@ class QueueContext final : public Context {
public:
void
- notify_submit(const VkSubmitInfo& info, const std::uint64_t& sequence,
+ notify_submit(const VkSubmitInfo& info,
const std::shared_ptr<TimestampPool::Handle> head_handle,
const std::shared_ptr<TimestampPool::Handle> tail_handle);
- // TODO submit2
+ void
+ notify_submit(const VkSubmitInfo2& info,
+ const std::shared_ptr<TimestampPool::Handle> head_handle,
+ const std::shared_ptr<TimestampPool::Handle> tail_handle);
void notify_present(const VkPresentInfoKHR& info);
public:
- // Computes the amount we should delay...
- using duration_t = std::chrono::steady_clock::duration;
- std::optional<duration_t> get_delay_time();
+ std::optional<DeviceContext::Clock::time_point_t> get_sleep_until();
};
}; // namespace low_latency