diff options
Diffstat (limited to 'src/queue_context.cc')
| -rw-r--r-- | src/queue_context.cc | 348 |
1 files changed, 256 insertions, 92 deletions
diff --git a/src/queue_context.cc b/src/queue_context.cc index 2b79b53..342f393 100644 --- a/src/queue_context.cc +++ b/src/queue_context.cc @@ -5,6 +5,7 @@ #include <algorithm> #include <chrono> #include <iostream> +#include <limits> #include <span> namespace low_latency { @@ -69,7 +70,7 @@ QueueContext::~QueueContext() { } void QueueContext::notify_submit( - const VkSubmitInfo& info, + const VkSubmitInfo& info, const std::uint64_t& sequence, const std::shared_ptr<TimestampPool::Handle> head_handle, const std::shared_ptr<TimestampPool::Handle> tail_handle) { @@ -80,8 +81,19 @@ void QueueContext::notify_submit( std::ranges::copy_n(info.pSignalSemaphores, info.signalSemaphoreCount, std::inserter(signals, std::end(signals))); - this->submissions.emplace_back(std::make_unique<Submission>( - std::move(signals), std::move(waits), head_handle, tail_handle)); + std::cerr << "submit1 notif for queue " << this->queue << '\n'; + std::cerr << " signals: \n"; + for (const auto& signal : signals) { + std::cerr << " " << signal << '\n'; + } + std::cerr << " waits: \n"; + for (const auto& wait : waits) { + std::cerr << " " << wait << '\n'; + } + + this->submissions.emplace_back( + std::make_unique<Submission>(std::move(signals), std::move(waits), + head_handle, tail_handle, sequence)); // TODO HACK if (std::size(this->submissions) > 100) { @@ -90,7 +102,7 @@ void QueueContext::notify_submit( } void QueueContext::notify_submit( - const VkSubmitInfo2& info, + const VkSubmitInfo2& info, const std::uint64_t& sequence, const std::shared_ptr<TimestampPool::Handle> head_handle, const std::shared_ptr<TimestampPool::Handle> tail_handle) { @@ -117,8 +129,9 @@ void QueueContext::notify_submit( std::cerr << " " << wait << '\n'; } - this->submissions.emplace_back(std::make_unique<Submission>( - std::move(signals), std::move(waits), head_handle, tail_handle)); + this->submissions.emplace_back( + std::make_unique<Submission>(std::move(signals), std::move(waits), + head_handle, tail_handle, sequence)); // TODO HACK if (std::size(this->submissions) > 100) { @@ -170,19 +183,28 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) { return; } const auto& acquire = *acquire_iter; + (*acquire_iter)->debug += "acquire "; const auto present_iter = std::ranges::find_if( std::rbegin(this->submissions), std::rend(this->submissions), [&](const auto& submission) { return std::ranges::any_of( - submission->signals, - [&](const auto& signal) { return waits.contains(signal); }); + submission->signals, [&](const auto& signal) { + if (waits.contains(signal)) { + std::cerr + << "queue with signal matching present iter found: " + << signal << '\n'; + } + + return waits.contains(signal); + }); }); if (present_iter == std::rend(this->submissions)) { std::cerr << "couldn't find ending submission!\n"; return; } + (*present_iter)->debug += "present "; const auto& end = *present_iter; std::cerr << "present for queue: " << queue << ", our waits:\n"; @@ -197,24 +219,39 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) { // Now we read backwards to try to find our true start, starting at our // acquire. - const auto start_iter = std::prev(std::ranges::find_if( + const auto after_previous_present_iter = std::prev(std::ranges::find_if( std::next(acquire_iter), std::rend(this->submissions), [](const auto& submission) { return submission->end_of_frame_marker; })); - const auto& start = *start_iter; - - // start iter can't be end cause it's prev'd. - - auto frame = Frame{.start = - Frame::Timepoint{ - .context = *this, - .handle = start->start_handle, - }, - .end = Frame::Timepoint{ - .context = *this, - .handle = end->end_handle, - }}; + + (*after_previous_present_iter)->debug += "after_previous_present "; + + this->submissions.back()->debug += "last_in_host_present "; + + std::cerr << "the present iter was dist from back of: " + << std::distance(std::rbegin(this->submissions), present_iter) + << '\n'; + std::cerr << "the acquire iter was dist from back of: " + << std::distance(std::rbegin(this->submissions), acquire_iter) + << '\n'; + std::cerr << "the before_previous_present iter was dist from back of: " + << std::distance(std::rbegin(this->submissions), + after_previous_present_iter) + << '\n'; + + auto frame = + Frame{.start = + Frame::Timepoint{ + .context = *this, + .handle = (*after_previous_present_iter)->start_handle, + .sequence = (*after_previous_present_iter)->sequence, + }, + .end = Frame::Timepoint{ + .context = *this, + .handle = (*present_iter)->end_handle, + .sequence = (*present_iter)->sequence, + }}; this->in_flight_frames.emplace_back( std::make_unique<Frame>(std::move(frame))); } @@ -224,7 +261,7 @@ const auto debug_log_time = [](const auto& diff) { const auto ms = duration_cast<milliseconds>(diff); const auto us = duration_cast<microseconds>(diff - ms); const auto ns = duration_cast<nanoseconds>(diff - ms - us); - std::cerr << ms << " " << us << " " << ns << "\n"; + std::cerr << ms << " " << us << " " << ns << " ago\n"; }; void QueueContext::process_frames() { @@ -233,7 +270,7 @@ void QueueContext::process_frames() { } // Collect all devices and call calibrate. - [this]() -> auto { + [this]() -> void { using context_ref_t = std::reference_wrapper<DeviceContext>; auto target_devices = std::unordered_map<VkDevice, context_ref_t>{}; for (const auto& frame : this->in_flight_frames) { @@ -277,105 +314,232 @@ void QueueContext::process_frames() { break; } - // assert(a <= b); - - // - const auto last_b = - this->timings.empty() ? *a : this->timings.back()->gpu_end; + // We are committed to removing the frame at this stage and promoting it + // to a 'timing' struct because it's completed.. - // assert(last_b <= a); - - const auto frametime = *b - last_b; + const auto frametime = *b - *a; std::cerr << " calculated total time from last frame (frametime): "; - debug_log_time(*b - last_b); - - this->timings.emplace_back(std::make_unique<Timing>(Timing{ - .gpu_start = *a, - .gpu_end = *b, - .frametime = frametime, - })); + debug_log_time(*b - *a); + this->timings.emplace_back(std::make_unique<Timing>( + Timing{.gpu_start = *a, + .gpu_end = *b, + .gpu_time = frametime, + .frame = std::move(this->in_flight_frames.front())})); this->in_flight_frames.pop_front(); } - const auto MAX_TRACKED = 50; - if (std::size(this->timings) < MAX_TRACKED) { - return; + if (const auto T = std::size(this->timings); + T > this->MAX_TRACKED_TIMINGS) { + + const auto dist = T - this->MAX_TRACKED_TIMINGS; + const auto erase_to_iter = std::next(std::begin(this->timings), dist); + this->timings.erase(std::begin(this->timings), erase_to_iter); } - this->timings.erase(std::begin(this->timings), - std::next(std::begin(this->timings), - std::size(this->timings) - MAX_TRACKED)); } using opt_time_point_t = std::optional<DeviceContext::Clock::time_point_t>; -opt_time_point_t QueueContext::get_sleep_until() { +void QueueContext::sleep_in_present() { + const auto& device = this->device_context; + const auto& vtable = device.vtable; // Call this to push all in flight frames into our timings structure, // but only if they're completed. So now they are truly *in flight frames*. this->process_frames(); - - // We have completed all frames. DO NOT WAIT! + + if (const auto F = std::size(this->in_flight_frames); F > 1) { + // In this case, we are so far ahead that there are multiple frames in + // flight. Either that, or our bookkeeping has gone horribly wrong! Wait + // on the 2nd last frame in flight to complete. This shunts us to F=1. + const auto second_iter = std::next(std::rbegin(this->in_flight_frames)); + assert(second_iter != std::rend(this->in_flight_frames)); + + const auto& frame = (*second_iter)->end.sequence; + const auto swi = VkSemaphoreWaitInfo{ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO, + .semaphoreCount = 1, + .pSemaphores = &this->semaphore, + .pValues = &(*second_iter)->end.sequence, + }; + vtable.WaitSemaphoresKHR(device.device, &swi, + std::numeric_limits<std::uint64_t>::max()); + + // Here + this->process_frames(); // get rid of completed frames + } else if (!F) { + // We have completed all frames. DO NOT WAIT! + return; + } + if (!std::size(this->in_flight_frames)) { - return std::nullopt; + return; + } + assert(std::size(this->in_flight_frames) == 1); + + // Not enough data yet to apply any delays. + if (std::size(this->timings) < this->MAX_TRACKED_TIMINGS) { + return; } - const auto median_frametime = [&, this]() { + const auto expected_gputime = [&, this]() { auto vect = std::vector<Timing*>{}; std::ranges::transform(this->timings, std::back_inserter(vect), [](const auto& timing) { return timing.get(); }); std::ranges::sort(vect, [](const auto& a, const auto& b) { - return a->frametime < b->frametime; + return a->gpu_time < b->gpu_time; }); - return vect[std::size(vect) / 2]->frametime; + // return vect[0]->frametime; + return vect[std::size(vect) / 2]->gpu_time; }(); + std::cerr << " expected gputime: "; + debug_log_time(expected_gputime); - // PRESENT CALL - // | -------x----- | -------x--------------| - // ^ last_b ^ a ^ b + // PRESENT CALL + // |-------------------------|---------------------------| + // ^a ^swap_acquire ^b // // Us, the CPU on the host, is approximately at 'b'. // We have a good guess for the distance between - // last_b and b (median_frametime). - // The GPU is at any point on this line (marked as x). - // Don't use A. It's less robust than just using last_b. - // It *might* be more accurate because it's closer, - // but there's an issue where there can sometimes be a very - // small distance between a and b because it is just the - // point in time when the vkAcquireSwapchainKHR signals - // the wait on the gpu queue, which can sometimes be tiny. - - std::cerr << " median 100 frametimes: "; - debug_log_time(median_frametime); - - // 2% of average gpu time for dealing with variance. - // This could be calculated more precisely with the - // numbers we have (like we could construct a high% confidence - // interval? not big on maths). - const auto slack = median_frametime / 50; - - // If we're more than 1 frame queued, then we should wait for - // that to complete before returning. It's likely way better to - // to sleep twice here and recompute between sleeps because we're - // extrapolating really far into the future here! TODO - const auto extra_delay = - median_frametime * (std::size(this->in_flight_frames) - 1); - - const auto& last_b = this->timings.back()->gpu_end; - - // All educated guesses: - // dist_to_b = frametime - dist_to_last_b; - // dist_to_last_b = now - last_b - // sleep_until = now + extra_delay + slack + dist_to_b - // = now + extra_delay + slack + (frametime - dist_to_last_b) - // = now + extra_delay + slack + frametime - (now - last_b) + // a and b (median gputime). The GPU is at any point on this line. + + const auto& frame = this->in_flight_frames.back(); + + // We could be in the period where A hasn't signalled yet. + // It's impossible to make a decision until we know a. + // Doing this is fine because it won't affect throughput at all. + // (ie, there's more work queued after regardless). + [&]() -> void { + const auto swi = VkSemaphoreWaitInfo{ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO, + .semaphoreCount = 1, + .pSemaphores = &this->semaphore, + .pValues = &frame->start.sequence, + }; + vtable.WaitSemaphoresKHR(device.device, &swi, + std::numeric_limits<std::uint64_t>::max()); + }(); + + // We now know that A is available because its semaphore has been signalled. + const auto a_handle = frame->start.handle; + const auto& a_context = frame->start.context; + + const auto a_ticks_opt = a_handle->get_ticks(*a_context.timestamp_pool); + assert(a_ticks_opt.has_value()); + const auto& a_clock = a_context.device_context.clock; + const auto a = a_clock.ticks_to_time(*a_ticks_opt); const auto now = std::chrono::steady_clock::now(); - assert(last_b <= now); - const auto dist = now - last_b; - // Even if this is negative, it's a no-op to sleep backwards. - return now + extra_delay + slack + median_frametime - dist; + const auto dist = now - a; + const auto expected = expected_gputime - dist; + + const auto swi = VkSemaphoreWaitInfo{ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO, + .semaphoreCount = 1, + .pSemaphores = &this->semaphore, + .pValues = &frame->end.sequence, + }; + vtable.WaitSemaphoresKHR(device.device, &swi, std::max(expected.count(), 0l)); + + /* + + // 2% of average gpu time for dealing with variance. + // This could be calculated more precisely with the + // numbers we have (like we could construct a high% confidence + // interval? not big on maths). + const auto slack = gputime / 50; + + // All educated guesses: + // dist_to_b = gputime - dist_to_last_b; + // dist_to_last_b = now - last_b + // sleep_until = now + extra_delay + slack + dist_to_b + // = now + extra_delay + slack + (frametime - + dist_to_last_b) + // = now + extra_delay + slack + frametime - (now - last_b) + + const auto now = std::chrono::steady_clock::now(); + assert(last_b <= now); + const auto dist = now - last_b; + // Even if this is negative, it's a no-op to sleep backwards. + const auto sleep_target = + now + extra_delay + slack + median_frametime - dist; + */ + + /* + std::cerr << " SLEEPING FOR: "; + debug_log_time(sleep_target - now); + std::this_thread::sleep_until(sleep_target); + */ + + /* + const auto get_tick_time = [](const auto& timepoint) + -> std::optional<DeviceContext::Clock::time_point_t> { + const auto& handle = timepoint.handle; + const auto& context = timepoint.context; + + const auto ticks = handle->get_ticks(*context.timestamp_pool); + if (!ticks.has_value()) { + return std::nullopt; + } + const auto& clock = context.device_context.clock; + return clock.ticks_to_time(*ticks); + }; + + const auto finished_time = + get_tick_time(this->in_flight_frames.back()->end); + const auto after = std::chrono::steady_clock::now(); + if (!finished_time.has_value()) { + std::cerr << "didnt finish late!\n"; + } else { + std::cerr << "finished late by: "; + const auto late_time = after - *finished_time; + } + + const auto get_tick_time2 = [this](const auto& handle) + -> std::optional<DeviceContext::Clock::time_point_t> { + const auto& context = *this; + + const auto ticks = handle->get_ticks(*context.timestamp_pool); + if (!ticks.has_value()) { + return std::nullopt; + } + const auto& clock = context.device_context.clock; + return clock.ticks_to_time(*ticks); + }; + + this->submissions.back()->debug += " last_in_wait"; + + std::cerr << "---------------SUBMISSION READOUT POST FRAME SUBMIT AND " + "WAIT--------------------\n"; + auto i = std::size_t{0}; + for (auto it = std::rbegin(this->submissions); + i < 20 && it != std::rend(this->submissions); ++it, ++i) { + + std::cerr << " submission -" << i << '\n'; + const auto& submission = **it; + + std::cerr << " sequence target: " << submission.sequence << '\n'; + + std::cerr << " debug tags: " << submission.debug << '\n'; + + const auto end_time = get_tick_time2(submission.end_handle); + std::cerr << " end_time: "; + if (end_time.has_value()) { + debug_log_time(after - *end_time); + } else { + std::cerr << "not finished yet!\n"; + } + + const auto start_time = get_tick_time2(submission.start_handle); + std::cerr << " start_time: "; + if (start_time.has_value()) { + debug_log_time(after - *start_time); + } else { + std::cerr << "not finished yet!\n"; + } + } + */ } } // namespace low_latency
\ No newline at end of file |
