aboutsummaryrefslogtreecommitdiff
path: root/src/queue_context.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/queue_context.cc')
-rw-r--r--src/queue_context.cc348
1 files changed, 256 insertions, 92 deletions
diff --git a/src/queue_context.cc b/src/queue_context.cc
index 2b79b53..342f393 100644
--- a/src/queue_context.cc
+++ b/src/queue_context.cc
@@ -5,6 +5,7 @@
#include <algorithm>
#include <chrono>
#include <iostream>
+#include <limits>
#include <span>
namespace low_latency {
@@ -69,7 +70,7 @@ QueueContext::~QueueContext() {
}
void QueueContext::notify_submit(
- const VkSubmitInfo& info,
+ const VkSubmitInfo& info, const std::uint64_t& sequence,
const std::shared_ptr<TimestampPool::Handle> head_handle,
const std::shared_ptr<TimestampPool::Handle> tail_handle) {
@@ -80,8 +81,19 @@ void QueueContext::notify_submit(
std::ranges::copy_n(info.pSignalSemaphores, info.signalSemaphoreCount,
std::inserter(signals, std::end(signals)));
- this->submissions.emplace_back(std::make_unique<Submission>(
- std::move(signals), std::move(waits), head_handle, tail_handle));
+ std::cerr << "submit1 notif for queue " << this->queue << '\n';
+ std::cerr << " signals: \n";
+ for (const auto& signal : signals) {
+ std::cerr << " " << signal << '\n';
+ }
+ std::cerr << " waits: \n";
+ for (const auto& wait : waits) {
+ std::cerr << " " << wait << '\n';
+ }
+
+ this->submissions.emplace_back(
+ std::make_unique<Submission>(std::move(signals), std::move(waits),
+ head_handle, tail_handle, sequence));
// TODO HACK
if (std::size(this->submissions) > 100) {
@@ -90,7 +102,7 @@ void QueueContext::notify_submit(
}
void QueueContext::notify_submit(
- const VkSubmitInfo2& info,
+ const VkSubmitInfo2& info, const std::uint64_t& sequence,
const std::shared_ptr<TimestampPool::Handle> head_handle,
const std::shared_ptr<TimestampPool::Handle> tail_handle) {
@@ -117,8 +129,9 @@ void QueueContext::notify_submit(
std::cerr << " " << wait << '\n';
}
- this->submissions.emplace_back(std::make_unique<Submission>(
- std::move(signals), std::move(waits), head_handle, tail_handle));
+ this->submissions.emplace_back(
+ std::make_unique<Submission>(std::move(signals), std::move(waits),
+ head_handle, tail_handle, sequence));
// TODO HACK
if (std::size(this->submissions) > 100) {
@@ -170,19 +183,28 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) {
return;
}
const auto& acquire = *acquire_iter;
+ (*acquire_iter)->debug += "acquire ";
const auto present_iter = std::ranges::find_if(
std::rbegin(this->submissions), std::rend(this->submissions),
[&](const auto& submission) {
return std::ranges::any_of(
- submission->signals,
- [&](const auto& signal) { return waits.contains(signal); });
+ submission->signals, [&](const auto& signal) {
+ if (waits.contains(signal)) {
+ std::cerr
+ << "queue with signal matching present iter found: "
+ << signal << '\n';
+ }
+
+ return waits.contains(signal);
+ });
});
if (present_iter == std::rend(this->submissions)) {
std::cerr << "couldn't find ending submission!\n";
return;
}
+ (*present_iter)->debug += "present ";
const auto& end = *present_iter;
std::cerr << "present for queue: " << queue << ", our waits:\n";
@@ -197,24 +219,39 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) {
// Now we read backwards to try to find our true start, starting at our
// acquire.
- const auto start_iter = std::prev(std::ranges::find_if(
+ const auto after_previous_present_iter = std::prev(std::ranges::find_if(
std::next(acquire_iter), std::rend(this->submissions),
[](const auto& submission) {
return submission->end_of_frame_marker;
}));
- const auto& start = *start_iter;
-
- // start iter can't be end cause it's prev'd.
-
- auto frame = Frame{.start =
- Frame::Timepoint{
- .context = *this,
- .handle = start->start_handle,
- },
- .end = Frame::Timepoint{
- .context = *this,
- .handle = end->end_handle,
- }};
+
+ (*after_previous_present_iter)->debug += "after_previous_present ";
+
+ this->submissions.back()->debug += "last_in_host_present ";
+
+ std::cerr << "the present iter was dist from back of: "
+ << std::distance(std::rbegin(this->submissions), present_iter)
+ << '\n';
+ std::cerr << "the acquire iter was dist from back of: "
+ << std::distance(std::rbegin(this->submissions), acquire_iter)
+ << '\n';
+ std::cerr << "the before_previous_present iter was dist from back of: "
+ << std::distance(std::rbegin(this->submissions),
+ after_previous_present_iter)
+ << '\n';
+
+ auto frame =
+ Frame{.start =
+ Frame::Timepoint{
+ .context = *this,
+ .handle = (*after_previous_present_iter)->start_handle,
+ .sequence = (*after_previous_present_iter)->sequence,
+ },
+ .end = Frame::Timepoint{
+ .context = *this,
+ .handle = (*present_iter)->end_handle,
+ .sequence = (*present_iter)->sequence,
+ }};
this->in_flight_frames.emplace_back(
std::make_unique<Frame>(std::move(frame)));
}
@@ -224,7 +261,7 @@ const auto debug_log_time = [](const auto& diff) {
const auto ms = duration_cast<milliseconds>(diff);
const auto us = duration_cast<microseconds>(diff - ms);
const auto ns = duration_cast<nanoseconds>(diff - ms - us);
- std::cerr << ms << " " << us << " " << ns << "\n";
+ std::cerr << ms << " " << us << " " << ns << " ago\n";
};
void QueueContext::process_frames() {
@@ -233,7 +270,7 @@ void QueueContext::process_frames() {
}
// Collect all devices and call calibrate.
- [this]() -> auto {
+ [this]() -> void {
using context_ref_t = std::reference_wrapper<DeviceContext>;
auto target_devices = std::unordered_map<VkDevice, context_ref_t>{};
for (const auto& frame : this->in_flight_frames) {
@@ -277,105 +314,232 @@ void QueueContext::process_frames() {
break;
}
- // assert(a <= b);
-
- //
- const auto last_b =
- this->timings.empty() ? *a : this->timings.back()->gpu_end;
+ // We are committed to removing the frame at this stage and promoting it
+ // to a 'timing' struct because it's completed..
- // assert(last_b <= a);
-
- const auto frametime = *b - last_b;
+ const auto frametime = *b - *a;
std::cerr
<< " calculated total time from last frame (frametime): ";
- debug_log_time(*b - last_b);
-
- this->timings.emplace_back(std::make_unique<Timing>(Timing{
- .gpu_start = *a,
- .gpu_end = *b,
- .frametime = frametime,
- }));
+ debug_log_time(*b - *a);
+ this->timings.emplace_back(std::make_unique<Timing>(
+ Timing{.gpu_start = *a,
+ .gpu_end = *b,
+ .gpu_time = frametime,
+ .frame = std::move(this->in_flight_frames.front())}));
this->in_flight_frames.pop_front();
}
- const auto MAX_TRACKED = 50;
- if (std::size(this->timings) < MAX_TRACKED) {
- return;
+ if (const auto T = std::size(this->timings);
+ T > this->MAX_TRACKED_TIMINGS) {
+
+ const auto dist = T - this->MAX_TRACKED_TIMINGS;
+ const auto erase_to_iter = std::next(std::begin(this->timings), dist);
+ this->timings.erase(std::begin(this->timings), erase_to_iter);
}
- this->timings.erase(std::begin(this->timings),
- std::next(std::begin(this->timings),
- std::size(this->timings) - MAX_TRACKED));
}
using opt_time_point_t = std::optional<DeviceContext::Clock::time_point_t>;
-opt_time_point_t QueueContext::get_sleep_until() {
+void QueueContext::sleep_in_present() {
+ const auto& device = this->device_context;
+ const auto& vtable = device.vtable;
// Call this to push all in flight frames into our timings structure,
// but only if they're completed. So now they are truly *in flight frames*.
this->process_frames();
-
- // We have completed all frames. DO NOT WAIT!
+
+ if (const auto F = std::size(this->in_flight_frames); F > 1) {
+ // In this case, we are so far ahead that there are multiple frames in
+ // flight. Either that, or our bookkeeping has gone horribly wrong! Wait
+ // on the 2nd last frame in flight to complete. This shunts us to F=1.
+ const auto second_iter = std::next(std::rbegin(this->in_flight_frames));
+ assert(second_iter != std::rend(this->in_flight_frames));
+
+ const auto& frame = (*second_iter)->end.sequence;
+ const auto swi = VkSemaphoreWaitInfo{
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
+ .semaphoreCount = 1,
+ .pSemaphores = &this->semaphore,
+ .pValues = &(*second_iter)->end.sequence,
+ };
+ vtable.WaitSemaphoresKHR(device.device, &swi,
+ std::numeric_limits<std::uint64_t>::max());
+
+ // Here
+ this->process_frames(); // get rid of completed frames
+ } else if (!F) {
+ // We have completed all frames. DO NOT WAIT!
+ return;
+ }
+
if (!std::size(this->in_flight_frames)) {
- return std::nullopt;
+ return;
+ }
+ assert(std::size(this->in_flight_frames) == 1);
+
+ // Not enough data yet to apply any delays.
+ if (std::size(this->timings) < this->MAX_TRACKED_TIMINGS) {
+ return;
}
- const auto median_frametime = [&, this]() {
+ const auto expected_gputime = [&, this]() {
auto vect = std::vector<Timing*>{};
std::ranges::transform(this->timings, std::back_inserter(vect),
[](const auto& timing) { return timing.get(); });
std::ranges::sort(vect, [](const auto& a, const auto& b) {
- return a->frametime < b->frametime;
+ return a->gpu_time < b->gpu_time;
});
- return vect[std::size(vect) / 2]->frametime;
+ // return vect[0]->frametime;
+ return vect[std::size(vect) / 2]->gpu_time;
}();
+ std::cerr << " expected gputime: ";
+ debug_log_time(expected_gputime);
- // PRESENT CALL
- // | -------x----- | -------x--------------|
- // ^ last_b ^ a ^ b
+ // PRESENT CALL
+ // |-------------------------|---------------------------|
+ // ^a ^swap_acquire ^b
//
// Us, the CPU on the host, is approximately at 'b'.
// We have a good guess for the distance between
- // last_b and b (median_frametime).
- // The GPU is at any point on this line (marked as x).
- // Don't use A. It's less robust than just using last_b.
- // It *might* be more accurate because it's closer,
- // but there's an issue where there can sometimes be a very
- // small distance between a and b because it is just the
- // point in time when the vkAcquireSwapchainKHR signals
- // the wait on the gpu queue, which can sometimes be tiny.
-
- std::cerr << " median 100 frametimes: ";
- debug_log_time(median_frametime);
-
- // 2% of average gpu time for dealing with variance.
- // This could be calculated more precisely with the
- // numbers we have (like we could construct a high% confidence
- // interval? not big on maths).
- const auto slack = median_frametime / 50;
-
- // If we're more than 1 frame queued, then we should wait for
- // that to complete before returning. It's likely way better to
- // to sleep twice here and recompute between sleeps because we're
- // extrapolating really far into the future here! TODO
- const auto extra_delay =
- median_frametime * (std::size(this->in_flight_frames) - 1);
-
- const auto& last_b = this->timings.back()->gpu_end;
-
- // All educated guesses:
- // dist_to_b = frametime - dist_to_last_b;
- // dist_to_last_b = now - last_b
- // sleep_until = now + extra_delay + slack + dist_to_b
- // = now + extra_delay + slack + (frametime - dist_to_last_b)
- // = now + extra_delay + slack + frametime - (now - last_b)
+ // a and b (median gputime). The GPU is at any point on this line.
+
+ const auto& frame = this->in_flight_frames.back();
+
+ // We could be in the period where A hasn't signalled yet.
+ // It's impossible to make a decision until we know a.
+ // Doing this is fine because it won't affect throughput at all.
+ // (ie, there's more work queued after regardless).
+ [&]() -> void {
+ const auto swi = VkSemaphoreWaitInfo{
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
+ .semaphoreCount = 1,
+ .pSemaphores = &this->semaphore,
+ .pValues = &frame->start.sequence,
+ };
+ vtable.WaitSemaphoresKHR(device.device, &swi,
+ std::numeric_limits<std::uint64_t>::max());
+ }();
+
+ // We now know that A is available because its semaphore has been signalled.
+ const auto a_handle = frame->start.handle;
+ const auto& a_context = frame->start.context;
+
+ const auto a_ticks_opt = a_handle->get_ticks(*a_context.timestamp_pool);
+ assert(a_ticks_opt.has_value());
+ const auto& a_clock = a_context.device_context.clock;
+ const auto a = a_clock.ticks_to_time(*a_ticks_opt);
const auto now = std::chrono::steady_clock::now();
- assert(last_b <= now);
- const auto dist = now - last_b;
- // Even if this is negative, it's a no-op to sleep backwards.
- return now + extra_delay + slack + median_frametime - dist;
+ const auto dist = now - a;
+ const auto expected = expected_gputime - dist;
+
+ const auto swi = VkSemaphoreWaitInfo{
+ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
+ .semaphoreCount = 1,
+ .pSemaphores = &this->semaphore,
+ .pValues = &frame->end.sequence,
+ };
+ vtable.WaitSemaphoresKHR(device.device, &swi, std::max(expected.count(), 0l));
+
+ /*
+
+ // 2% of average gpu time for dealing with variance.
+ // This could be calculated more precisely with the
+ // numbers we have (like we could construct a high% confidence
+ // interval? not big on maths).
+ const auto slack = gputime / 50;
+
+ // All educated guesses:
+ // dist_to_b = gputime - dist_to_last_b;
+ // dist_to_last_b = now - last_b
+ // sleep_until = now + extra_delay + slack + dist_to_b
+ // = now + extra_delay + slack + (frametime -
+ dist_to_last_b)
+ // = now + extra_delay + slack + frametime - (now - last_b)
+
+ const auto now = std::chrono::steady_clock::now();
+ assert(last_b <= now);
+ const auto dist = now - last_b;
+ // Even if this is negative, it's a no-op to sleep backwards.
+ const auto sleep_target =
+ now + extra_delay + slack + median_frametime - dist;
+ */
+
+ /*
+ std::cerr << " SLEEPING FOR: ";
+ debug_log_time(sleep_target - now);
+ std::this_thread::sleep_until(sleep_target);
+ */
+
+ /*
+ const auto get_tick_time = [](const auto& timepoint)
+ -> std::optional<DeviceContext::Clock::time_point_t> {
+ const auto& handle = timepoint.handle;
+ const auto& context = timepoint.context;
+
+ const auto ticks = handle->get_ticks(*context.timestamp_pool);
+ if (!ticks.has_value()) {
+ return std::nullopt;
+ }
+ const auto& clock = context.device_context.clock;
+ return clock.ticks_to_time(*ticks);
+ };
+
+ const auto finished_time =
+ get_tick_time(this->in_flight_frames.back()->end);
+ const auto after = std::chrono::steady_clock::now();
+ if (!finished_time.has_value()) {
+ std::cerr << "didnt finish late!\n";
+ } else {
+ std::cerr << "finished late by: ";
+ const auto late_time = after - *finished_time;
+ }
+
+ const auto get_tick_time2 = [this](const auto& handle)
+ -> std::optional<DeviceContext::Clock::time_point_t> {
+ const auto& context = *this;
+
+ const auto ticks = handle->get_ticks(*context.timestamp_pool);
+ if (!ticks.has_value()) {
+ return std::nullopt;
+ }
+ const auto& clock = context.device_context.clock;
+ return clock.ticks_to_time(*ticks);
+ };
+
+ this->submissions.back()->debug += " last_in_wait";
+
+ std::cerr << "---------------SUBMISSION READOUT POST FRAME SUBMIT AND "
+ "WAIT--------------------\n";
+ auto i = std::size_t{0};
+ for (auto it = std::rbegin(this->submissions);
+ i < 20 && it != std::rend(this->submissions); ++it, ++i) {
+
+ std::cerr << " submission -" << i << '\n';
+ const auto& submission = **it;
+
+ std::cerr << " sequence target: " << submission.sequence << '\n';
+
+ std::cerr << " debug tags: " << submission.debug << '\n';
+
+ const auto end_time = get_tick_time2(submission.end_handle);
+ std::cerr << " end_time: ";
+ if (end_time.has_value()) {
+ debug_log_time(after - *end_time);
+ } else {
+ std::cerr << "not finished yet!\n";
+ }
+
+ const auto start_time = get_tick_time2(submission.start_handle);
+ std::cerr << " start_time: ";
+ if (start_time.has_value()) {
+ debug_log_time(after - *start_time);
+ } else {
+ std::cerr << "not finished yet!\n";
+ }
+ }
+ */
}
} // namespace low_latency \ No newline at end of file