diff options
| author | Nicolas James <Eele1Ephe7uZahRie@tutanota.com> | 2026-02-14 14:05:26 +1100 |
|---|---|---|
| committer | Nicolas James <Eele1Ephe7uZahRie@tutanota.com> | 2026-02-14 14:05:26 +1100 |
| commit | 0deb469d5a7c9a16179139dcff74a54aac1791a0 (patch) | |
| tree | 2f791a72f0441a30321332f5ecea5865f357e2bb /src | |
| parent | 8f4501215c0dbbbde59da2d015fdec3dbe5131bc (diff) | |
commit wip
Diffstat (limited to 'src')
| -rw-r--r-- | src/device_context.cc | 46 | ||||
| -rw-r--r-- | src/device_context.hh | 11 | ||||
| -rw-r--r-- | src/layer.cc | 116 | ||||
| -rw-r--r-- | src/queue_context.cc | 429 | ||||
| -rw-r--r-- | src/queue_context.hh | 35 | ||||
| -rw-r--r-- | src/timestamp_pool.cc | 208 | ||||
| -rw-r--r-- | src/timestamp_pool.hh | 63 |
7 files changed, 527 insertions, 381 deletions
diff --git a/src/device_context.cc b/src/device_context.cc index 4b39210..f849df1 100644 --- a/src/device_context.cc +++ b/src/device_context.cc @@ -31,50 +31,58 @@ void DeviceContext::notify_acquire(const VkSwapchainKHR& swapchain, it->second.insert_or_assign(image_index, signal_semaphore); } -DeviceContext::Clock::Clock(const DeviceContext& context) { +DeviceContext::Clock::Clock(const DeviceContext& context) : device(context) { + this->calibrate(); +} + +DeviceContext::Clock::~Clock() {} +void DeviceContext::Clock::calibrate() { const auto infos = std::vector<VkCalibratedTimestampInfoKHR>{ {VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, VK_TIME_DOMAIN_DEVICE_EXT}, {VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr, VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT}}; - auto device_host = std::array<std::uint64_t, 2>{}; + struct CalibratedResult { + std::uint64_t device; + std::uint64_t host; + }; + auto calibrated_result = CalibratedResult{}; + // we probably want to use this instead bc clock_gettime isn't guaranteed + // by steady clock afaik + /* + struct timespec tv; + clock_gettime(CLOCK_MONOTONIC, &tv); + return tv.tv_nsec + tv.tv_sec*1000000000ull; + */ const auto steady_before = std::chrono::steady_clock::now(); - context.vtable.GetCalibratedTimestampsKHR( - context.device, 2, std::data(infos), std::data(device_host), - &this->error_bound); + device.vtable.GetCalibratedTimestampsKHR(device.device, 2, std::data(infos), + &calibrated_result.device, + &this->error_bound); const auto steady_after = std::chrono::steady_clock::now(); this->cpu_time = steady_before + (steady_after - steady_before) / 2; - this->device_ticks = device_host[0]; - this->host_ns = device_host[1]; + this->device_ticks = calibrated_result.device; + this->host_ns = calibrated_result.host; - // Might need to get physical limits again? - this->ticks_per_ns = - context.physical_device.properties->limits.timestampPeriod; + // Might need to get physical limits every now and then? + const auto& pd = device.physical_device.properties; + this->ticks_per_ns = pd->limits.timestampPeriod; } DeviceContext::Clock::time_point_t DeviceContext::Clock::ticks_to_time(const std::uint64_t& ticks) const { - /* - struct timespec tv; - clock_gettime(CLOCK_MONOTONIC, &tv); - return tv.tv_nsec + tv.tv_sec*1000000000ull; - */ - auto a = this->device_ticks; auto b = ticks; - const auto was_before = a > b; if (was_before) { // it's happened before std::swap(a, b); } + const auto nsec = std::chrono::nanoseconds((b - a) * this->ticks_per_ns); return this->cpu_time + (was_before ? -nsec : nsec); } -void DeviceContext::calibrate_timestamps() { this->clock = Clock{*this}; } - } // namespace low_latency
\ No newline at end of file diff --git a/src/device_context.hh b/src/device_context.hh index b55b70c..c08cec2 100644 --- a/src/device_context.hh +++ b/src/device_context.hh @@ -35,8 +35,11 @@ struct DeviceContext final : public Context { std::unordered_map<VkSwapchainKHR, index_semaphores_t> swapchain_signals; struct Clock { + public: using time_point_t = std::chrono::steady_clock::time_point; + const DeviceContext& device; + public: time_point_t cpu_time; std::uint64_t error_bound; std::uint64_t device_ticks; @@ -45,7 +48,10 @@ struct DeviceContext final : public Context { public: Clock(const DeviceContext& device); - + ~Clock(); + + public: + void calibrate(); time_point_t ticks_to_time(const std::uint64_t& ticks) const; }; Clock clock; @@ -61,9 +67,6 @@ struct DeviceContext final : public Context { void notify_acquire(const VkSwapchainKHR& swapchain, const std::uint32_t& image_index, const VkSemaphore& signal_semaphore); - - public: - void calibrate_timestamps(); }; }; // namespace low_latency diff --git a/src/layer.cc b/src/layer.cc index c521bb9..1b1d9e7 100644 --- a/src/layer.cc +++ b/src/layer.cc @@ -1,9 +1,12 @@ #include "layer.hh" +#include <memory> +#include <span> #include <string_view> #include <thread> #include <unordered_map> #include <utility> +#include <vector> #include <vulkan/utility/vk_dispatch_table.h> #include <vulkan/vk_layer.h> @@ -224,7 +227,8 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice( const auto wanted_extensions = { VK_KHR_SYNCHRONIZATION_2_EXTENSION_NAME, VK_KHR_CALIBRATED_TIMESTAMPS_EXTENSION_NAME, - VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME}; + VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME, + VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME}; for (const auto& wanted : wanted_extensions) { @@ -274,7 +278,7 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice( return result; } - + #define DEVICE_VTABLE_LOAD(name) \ .name = reinterpret_cast<PFN_vk##name>(gdpa(*pDevice, "vk" #name)) auto vtable = VkuDeviceDispatchTable{ @@ -294,9 +298,9 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice( DEVICE_VTABLE_LOAD(BeginCommandBuffer), DEVICE_VTABLE_LOAD(EndCommandBuffer), DEVICE_VTABLE_LOAD(ResetCommandBuffer), - DEVICE_VTABLE_LOAD(CmdResetQueryPool), DEVICE_VTABLE_LOAD(CmdDraw), DEVICE_VTABLE_LOAD(CmdDrawIndexed), + DEVICE_VTABLE_LOAD(CmdResetQueryPool), DEVICE_VTABLE_LOAD(GetDeviceQueue2), DEVICE_VTABLE_LOAD(QueueSubmit2), DEVICE_VTABLE_LOAD(AcquireNextImageKHR), @@ -306,6 +310,7 @@ static VKAPI_ATTR VkResult VKAPI_CALL CreateDevice( DEVICE_VTABLE_LOAD(CmdWriteTimestamp2KHR), DEVICE_VTABLE_LOAD(QueueSubmit2KHR), DEVICE_VTABLE_LOAD(GetCalibratedTimestampsKHR), + DEVICE_VTABLE_LOAD(ResetQueryPoolEXT), }; #undef DEVICE_VTABLE_LOAD @@ -442,61 +447,81 @@ static VKAPI_ATTR VkResult VKAPI_CALL vkAcquireNextImage2KHR( static VKAPI_ATTR VkResult VKAPI_CALL vkQueueSubmit(VkQueue queue, std::uint32_t submit_count, - const VkSubmitInfo* submit_info, VkFence fence) { + const VkSubmitInfo* submit_infos, VkFence fence) { const auto& queue_context = layer_context.get_context(queue); const auto& vtable = queue_context->device_context.vtable; if (!submit_count) { // no-op submit we shouldn't worry about - return vtable.QueueSubmit(queue, submit_count, submit_info, fence); + return vtable.QueueSubmit(queue, submit_count, submit_infos, fence); } - // Create a new vector of submit infos. - auto next_submit_infos = std::vector<VkSubmitInfo>{}; - - auto timestamp_handle = queue_context->timestamp_pool->acquire(); - timestamp_handle->setup_command_buffers(vtable); - - const auto& [head_cb, tail_cb] = timestamp_handle->command_buffers; + // We have to avoid casting away the const* of the passed VkSubmitInfos. + // We wrap every single submission with *two* extra VkSubmitInfos to + // accomplish this. The first executes a command buffer that + + using cb_vect = std::vector<VkCommandBuffer>; + using tssi_ptr_t = std::unique_ptr<VkTimelineSemaphoreSubmitInfo>; + auto next_submits = std::vector<VkSubmitInfo>{}; + auto next_cbs = std::vector<std::unique_ptr<cb_vect>>{}; + auto next_signals = std::vector<std::unique_ptr<std::uint64_t>>{}; + auto next_tssis = std::vector<tssi_ptr_t>{}; + auto handles = std::vector<std::shared_ptr<TimestampPool::Handle>>{}; + + for (const auto& submit_info : std::span{submit_infos, submit_count}) { + const auto head_handle = queue_context->timestamp_pool->acquire(); + const auto tail_handle = queue_context->timestamp_pool->acquire(); + + // Head is special as we need to inject a CB into a copy of + // their command buffers that records the time the waits completed. + next_cbs.emplace_back([&]() -> auto { + auto cbs = std::make_unique<std::vector<VkCommandBuffer>>(); + head_handle->setup_command_buffers(*tail_handle, *queue_context); + cbs->push_back(head_handle->command_buffer); + std::ranges::copy_n(submit_info.pCommandBuffers, + submit_info.commandBufferCount, + std::back_inserter(*cbs)); + cbs->push_back(tail_handle->command_buffer); + return cbs; + }()); + next_submits.push_back(submit_info); + next_submits.back().pCommandBuffers = std::data(*next_cbs.back()); + next_submits.back().commandBufferCount = std::size(*next_cbs.back()); + + const auto next_signal = 1 + queue_context->semaphore_sequence++; + + next_signals.push_back(std::make_unique<std::uint64_t>(next_signal)); + + next_tssis.push_back(std::make_unique<VkTimelineSemaphoreSubmitInfo>( + VkTimelineSemaphoreSubmitInfo{ + .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR, + .signalSemaphoreValueCount = 1, + .pSignalSemaphoreValues = next_signals.back().get(), + })); + next_submits.push_back(VkSubmitInfo{ + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, + .pNext = next_tssis.back().get(), + .commandBufferCount = 1, + .pCommandBuffers = &tail_handle->command_buffer, + .signalSemaphoreCount = 1, + .pSignalSemaphores = &queue_context->semaphore, + }); - const auto next_command_buffers = [&]() -> auto { - auto next_command_buffers = std::vector<VkCommandBuffer>{head_cb}; - std::ranges::copy_n(submit_info[0].pCommandBuffers, - submit_info[0].commandBufferCount, - std::back_inserter(next_command_buffers)); - return next_command_buffers; - }(); + queue_context->notify_submit(submit_info, next_signal, head_handle, + tail_handle); - std::ranges::copy_n(submit_info, submit_count, - std::back_inserter(next_submit_infos)); - next_submit_infos[0].pCommandBuffers = std::data(next_command_buffers); - next_submit_infos[0].commandBufferCount = std::size(next_command_buffers); - - const auto next_signal = 1 + queue_context->semaphore_sequence++; - const auto tail_tssi = VkTimelineSemaphoreSubmitInfo{ - .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR, - .signalSemaphoreValueCount = 1, - .pSignalSemaphoreValues = &next_signal, - }; - next_submit_infos.push_back(VkSubmitInfo{ - .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, - .pNext = &tail_tssi, - .commandBufferCount = 1, - .pCommandBuffers = &tail_cb, - .signalSemaphoreCount = 1, - .pSignalSemaphores = &queue_context->semaphore, - }); + handles.push_back(head_handle); + handles.push_back(tail_handle); + } - if (const auto res = - vtable.QueueSubmit(queue, std::size(next_submit_infos), - std::data(next_submit_infos), fence); + if (const auto res = vtable.QueueSubmit(queue, std::size(next_submits), + std::data(next_submits), fence); res != VK_SUCCESS) { return res; } - queue_context->notify_submit(std::span{submit_info, submit_count}, - next_signal, std::move(timestamp_handle)); + // ?!? return VK_SUCCESS; } @@ -509,10 +534,12 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count, const auto queue_context = layer_context.get_context(queue); const auto& vtable = queue_context->device_context.vtable; - if (!submit_count) { + // TODO + if (!submit_count || true) { return vtable.QueueSubmit2(queue, submit_count, submit_infos, fence); } + /* auto timestamp_handle = queue_context->timestamp_pool->acquire(); timestamp_handle->setup_command_buffers(vtable); const auto& [head_cb, tail_cb] = timestamp_handle->command_buffers; @@ -568,6 +595,7 @@ vkQueueSubmit2(VkQueue queue, std::uint32_t submit_count, std::move(timestamp_handle)); return VK_SUCCESS; + */ } static VKAPI_ATTR VkResult VKAPI_CALL diff --git a/src/queue_context.cc b/src/queue_context.cc index 9b46773..99cf51e 100644 --- a/src/queue_context.cc +++ b/src/queue_context.cc @@ -67,25 +67,20 @@ QueueContext::~QueueContext() { } void QueueContext::notify_submit( - std::span<const VkSubmitInfo> infos, - const std::uint64_t target_semaphore_sequence, - std::shared_ptr<TimestampPool::Handle>&& handle) { - - // This has an issue where we're collecting all signals and waits and - // treating a single submit call as finishing + const VkSubmitInfo& info, const std::uint64_t& target_semaphore_sequence, + const std::shared_ptr<TimestampPool::Handle> head_handle, + const std::shared_ptr<TimestampPool::Handle> tail_handle) { auto signals = std::unordered_set<VkSemaphore>{}; auto waits = std::unordered_set<VkSemaphore>{}; - for (const auto& info : infos) { - std::ranges::copy_n(info.pWaitSemaphores, info.waitSemaphoreCount, - std::inserter(waits, std::end(waits))); - std::ranges::copy_n(info.pSignalSemaphores, info.signalSemaphoreCount, - std::inserter(signals, std::end(signals))); - } + std::ranges::copy_n(info.pWaitSemaphores, info.waitSemaphoreCount, + std::inserter(waits, std::end(waits))); + std::ranges::copy_n(info.pSignalSemaphores, info.signalSemaphoreCount, + std::inserter(signals, std::end(signals))); this->submissions.emplace_back(std::make_unique<Submission>( std::move(signals), std::move(waits), target_semaphore_sequence, - std::move(handle))); + head_handle, tail_handle)); // TODO HACK if (std::size(this->submissions) > 100) { @@ -93,6 +88,7 @@ void QueueContext::notify_submit( } } +/* void QueueContext::notify_submit( std::span<const VkSubmitInfo2> infos, const std::uint64_t target_semaphore_sequence, @@ -100,6 +96,7 @@ void QueueContext::notify_submit( auto signals = std::unordered_set<VkSemaphore>{}; auto waits = std::unordered_set<VkSemaphore>{}; + for (const auto& info : infos) { constexpr auto get_semaphore = [](const auto& semaphore_info) { return semaphore_info.semaphore; @@ -124,21 +121,18 @@ void QueueContext::notify_submit( this->submissions.pop_front(); } } +*/ void QueueContext::notify_present(const VkPresentInfoKHR& info) { - auto frame = [&]() -> std::unique_ptr<Frame> { - const auto waits = [&]() { - auto waits = std::unordered_set<VkSemaphore>{}; - std::ranges::copy_n(info.pWaitSemaphores, info.waitSemaphoreCount, - std::inserter(waits, std::end(waits))); - return waits; - }(); - - const auto wait_semaphores = std::unordered_set<VkSemaphore>{ - info.pWaitSemaphores, - std::next(info.pWaitSemaphores, info.waitSemaphoreCount)}; + const auto waits = [&]() { + auto waits = std::unordered_set<VkSemaphore>{}; + std::ranges::copy_n(info.pWaitSemaphores, info.waitSemaphoreCount, + std::inserter(waits, std::end(waits))); + return waits; + }(); + const auto collected_semaphores = [&info, this]() { auto collected_semaphores = std::unordered_set<VkSemaphore>{}; for (auto i = std::uint32_t{0}; i < info.swapchainCount; ++i) { const auto& swapchain = info.pSwapchains[i]; @@ -153,112 +147,147 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) { const auto index_it = swapchain_it->second.find(index); assert(index_it != std::end(swapchain_it->second)); - const auto semaphore = index_it->second; + const auto& semaphore = index_it->second; collected_semaphores.emplace(index_it->second); } + return collected_semaphores; + }(); - const auto start_submission_it = std::ranges::find_if( - std::rbegin(this->submissions), std::rend(this->submissions), - [&](const auto& submission) { - return std::ranges::any_of( - submission->waits, [&](const auto& wait) { - return collected_semaphores.contains(wait); - }); - }); - - if (start_submission_it == std::rend(this->submissions)) { - std::cout << "couldn't find starting submission!\n"; - return nullptr; - } - const auto& start_submission = *start_submission_it; - - const auto end_submission_it = std::ranges::find_if( - std::rbegin(this->submissions), std::rend(this->submissions), - [&](const auto& submission) { - return std::ranges::any_of( - submission->signals, [&](const auto& signal) { - return wait_semaphores.contains(signal); - }); - }); - - if (end_submission_it == std::rend(this->submissions)) { - std::cout << "couldn't find ending submission!\n"; - return nullptr; - } - const auto& end_submission = *end_submission_it; - - return std::make_unique<Frame>(Frame{ - .start_context = *this, - .start = start_submission->timestamp_handle, - .target_start_sequence = - start_submission->target_semaphore_sequence, - .end_context = *this, - .end = start_submission->timestamp_handle, - .target_end_sequence = start_submission->target_semaphore_sequence, + const auto start_iter = std::ranges::find_if( + std::rbegin(this->submissions), std::rend(this->submissions), + [&](const auto& submission) { + return std::ranges::any_of( + submission->waits, [&](const auto& wait) { + return collected_semaphores.contains(wait); + }); }); - }(); - this->in_flight_frames.emplace_back(std::move(frame)); - + if (start_iter == std::rend(this->submissions)) { + std::cout << "couldn't find starting submission!\n"; + return; + } + const auto& start = *start_iter; + + const auto end_iter = std::ranges::find_if( + std::rbegin(this->submissions), std::rend(this->submissions), + [&](const auto& submission) { + return std::ranges::any_of( + submission->signals, + [&](const auto& signal) { return waits.contains(signal); }); + }); + + if (end_iter == std::rend(this->submissions)) { + std::cout << "couldn't find ending submission!\n"; + return; + } + const auto& end = *end_iter; + + auto frame = Frame{.start = + Frame::Timepoint{ + .context = *this, + .handle = start->start_handle, + .sequence = start->sequence, + }, + .end = Frame::Timepoint{ + .context = *this, + .handle = end->end_handle, + .sequence = end->sequence, + }}; + this->in_flight_frames.emplace_back( + std::make_unique<Frame>(std::move(frame))); + // hack if (this->in_flight_frames.size() > 5) { this->in_flight_frames.pop_front(); } } -// now it's all coming together std::optional<QueueContext::duration_t> QueueContext::get_delay_time() { if (!std::size(this->in_flight_frames)) { return std::nullopt; } - auto seq = std::uint64_t{}; - this->device_context.vtable.GetSemaphoreCounterValueKHR( - this->device_context.device, this->semaphore, &seq); - - // Get semaphore first, then poll! - this->timestamp_pool->poll(); + // We are about to query the wait semaphores of all of our current + // frames in flight. They may come from the same device, so we're going + // to build a mapping here to reduce vulkan calls. Not only that, + // we have to do this or else our timing information becomes broken + // as this loop iterates. + const auto target_devices = [this]() -> auto { + using context_ref_t = std::reference_wrapper<DeviceContext>; + auto target_devices = std::unordered_map<VkDevice, context_ref_t>{}; + for (const auto& frame : this->in_flight_frames) { + auto& start = frame->start.context.device_context; + auto& end = frame->end.context.device_context; + + target_devices.try_emplace(start.device, std::ref(start)); + target_devices.try_emplace(end.device, std::ref(end)); + } + return target_devices; + }(); - // idk how frequently we should call this. - this->device_context.calibrate_timestamps(); + // Calibrate timestamps before we acquire semaphores. + for (const auto& pair : target_devices) { + auto& device = pair.second; + device_context.clock.calibrate(); + } - static auto gpu_frametimes = std::deque<uint64_t>{}; - static auto cpu_frametimes = std::deque<uint64_t>{}; + // Now we have all owned devices and their clocks are in a good state. + // We need to build another mapping of semaphores to their queries now. + const auto queue_sequences = [this]() -> auto { + auto queue_sequences = std::unordered_map<VkQueue, std::uint64_t>{}; + for (const auto& frame : this->in_flight_frames) { + auto& start = frame->start.context; + auto& end = frame->end.context; + + for (const auto& queue_ptr : {&start, &end}) { + if (queue_sequences.contains(queue_ptr->queue)) { + continue; + } + + const auto& vtable = queue_ptr->device_context.vtable; + auto seq = std::uint64_t{}; + vtable.GetSemaphoreCounterValueKHR(this->device_context.device, + this->semaphore, &seq); + queue_sequences.emplace(queue_ptr->queue, seq); + } + } + return queue_sequences; + }(); + // Now all devices we are about to query are primed to query. + // We have all sequence numbers from all queus we could possibly query. const auto S = std::size(this->in_flight_frames); + for (auto i = std::size_t{0}; i < S; ++i) { + assert(this->in_flight_frames[i]); + const auto& frame = *this->in_flight_frames[i]; + const auto& start = frame.start; + const auto& end = frame.end; - std::cout << "\nSTART FRAME READOUT\n"; - std::cout << "error bound: " << this->device_context.clock.error_bound - << '\n'; - std::cout << "num frames in flight: " << S << '\n'; - std::cout << "from oldest -> newest\n"; - - // const auto b_seq = semaphore_from_context(*this); - const auto now = std::chrono::steady_clock::now(); - - auto i = std::size_t{0}; - for (; i < std::size(this->in_flight_frames); ++i) { - const auto& frame = this->in_flight_frames[i]; std::cout << " Evaluating the frame that's " << S - i - 1 << " behind\n"; - if (!frame) { - std::cout << " nullptr!\n"; + + std::cout << " target start seq: " << start.sequence << '\n'; + std::cout << " target end seq: " << end.sequence << '\n'; + + const auto start_seq_it = queue_sequences.find(start.context.queue); + assert(start_seq_it != std::end(queue_sequences)); + const auto& start_seq = start_seq_it->second; + if (start_seq < start.sequence) { + std::cout << " frame hasn't started yet !\n "; continue; } - std::cout << " target start: " << frame->target_start_sequence << '\n'; - std::cout << " target end: " << frame->target_end_sequence << '\n'; - if (seq < frame->target_start_sequence) { - std::cout << " frame hasn't started yet!\n"; - continue; + /* + const auto start_ticks_opt = + start.handle->get_ticks(*start.context.timestamp_pool); + if (!start_ticks_opt.has_value()) { + std::cout << " frame hasn't started yet !\n "; } - const auto start_ticks = - frame->start_context.timestamp_pool->get_polled(*frame->start); std::cout << " START TICKS: " << start_ticks << '\n'; - const auto& a_clock = frame->start_context.device_context.clock; - const auto a = a_clock.ticks_to_time(start_ticks); - + const auto start_time = + start.context.device_context.clock.ticks_to_time(start_ticks); + { using namespace std::chrono; const auto diff = now - a; @@ -269,85 +298,161 @@ std::optional<QueueContext::duration_t> QueueContext::get_delay_time() { << " us " << ns << " ns ago\n"; } - if (seq < frame->target_end_sequence) { - std::cout << " frame hasn't ended yet!\n"; + const auto end_seq_it = queue_sequences.find(end.context.queue); + assert(end_seq_it != std::end(queue_sequences)); + const auto& end_seq = end_seq_it->second; + if (start_seq < end.sequence) { + std::cout << " frame hasn't started yet !\n "; continue; } + */ + } + return std::nullopt; + // +} - const auto end_ticks = - frame->end_context.timestamp_pool->get_polled(*frame->end, true); - const auto& b_clock = frame->end_context.device_context.clock; - std::cout << " END_TICKS: " << end_ticks << '\n'; - const auto b = b_clock.ticks_to_time(end_ticks); - { - using namespace std::chrono; - if (now <= b) { - std::cout << "b happened before now?\n"; - } - const auto diff = now - b; - const auto ms = duration_cast<milliseconds>(diff); - const auto us = duration_cast<microseconds>(diff - ms); - const auto ns = duration_cast<nanoseconds>(diff - ms - us); - std::cout << " frame ended: " << ms << " ms " << us - << " us " << ns << " ns ago\n"; - } +// now it's all coming together +// std::optional<QueueContext::duration_t> QueueContext::get_delay_time() { +/* +if (!std::size(this->in_flight_frames)) { + return std::nullopt; +} - const auto gpu_time = b - a; - { - using namespace std::chrono; - const auto diff = gpu_time; - const auto ms = duration_cast<milliseconds>(diff); - const auto us = duration_cast<microseconds>(diff - ms); - const auto ns = duration_cast<nanoseconds>(diff - ms - us); - std::cout << " gpu_time: " << ms << " ms " << us - << " us " << ns << " ns ago\n"; - } +auto seq = std::uint64_t{}; +this->device_context.vtable.GetSemaphoreCounterValueKHR( + this->device_context.device, this->semaphore, &seq); - /* - cpu_frametimes.emplace_back(cpu_time); - gpu_frametimes.emplace_back(gpu_time); - */ - } +// Get semaphore first, then poll! +this->timestamp_pool->poll(); - /* - if (remove_index.has_value()) { - this->in_flight_frames.erase(std::begin(this->in_flight_frames), - std::begin(this->in_flight_frames) + - *remove_index); +// idk how frequently we should call this. +this->device_context.calibrate_timestamps(); + +static auto gpu_frametimes = std::deque<uint64_t>{}; +static auto cpu_frametimes = std::deque<uint64_t>{}; + +const auto S = std::size(this->in_flight_frames); + +std::cout << "\nSTART FRAME READOUT\n"; +std::cout << "error bound: " << this->device_context.clock.error_bound + << '\n'; +std::cout << "num frames in flight: " << S << '\n'; +std::cout << "from oldest -> newest\n"; + +// const auto b_seq = semaphore_from_context(*this); +const auto now = std::chrono::steady_clock::now(); + +auto i = std::size_t{0}; +for (; i < std::size(this->in_flight_frames); ++i) { + const auto& frame = this->in_flight_frames[i]; + std::cout << " Evaluating the frame that's " << S - i - 1 + << " behind\n"; + if (!frame) { + std::cout << " nullptr!\n"; + continue; } - */ - /* - auto g_copy = gpu_frametimes; - auto c_copy = cpu_frametimes; - std::ranges::sort(g_copy); - std::ranges::sort(c_copy); + std::cout << " target start: " << frame->target_start_sequence << +'\n'; std::cout << " target end: " << frame->target_end_sequence << '\n'; if +(seq < frame->target_start_sequence) { std::cout << " frame hasn't +started yet!\n"; continue; + } - constexpr auto N = 49; - if (std::size(cpu_frametimes) < N) { - return std::nullopt; + const auto start_ticks = + frame->start_context.timestamp_pool->get_polled(*frame->start); + std::cout << " START TICKS: " << start_ticks << '\n'; + const auto& a_clock = frame->start_context.device_context.clock; + const auto a = a_clock.ticks_to_time(start_ticks); + + { + using namespace std::chrono; + const auto diff = now - a; + const auto ms = duration_cast<milliseconds>(diff); + const auto us = duration_cast<microseconds>(diff - ms); + const auto ns = duration_cast<nanoseconds>(diff - ms - us); + std::cout << " frame started: " << ms << " ms " << us + << " us " << ns << " ns ago\n"; } - const auto F = std::size(g_copy); - // close enough to median lol - const auto g = g_copy[F / 2]; - const auto c = c_copy[F / 2]; + if (seq < frame->target_end_sequence) { + std::cout << " frame hasn't ended yet!\n"; + continue; + } - std::cout << g << '\n'; - std::cout << " median gpu: " << (g / 1'000'000) << " ms " << g / 1'000 - << " us " << g << " ns\n"; - std::cout << " median cpu: " << c / 1'000'000 << " ms " << c / 1'000 - << " us " << c << " ns\n"; + const auto end_ticks = + frame->end_context.timestamp_pool->get_polled(*frame->end, true); + const auto& b_clock = frame->end_context.device_context.clock; + std::cout << " END_TICKS: " << end_ticks << '\n'; + const auto b = b_clock.ticks_to_time(end_ticks); + { + using namespace std::chrono; + if (now <= b) { + std::cout << "b happened before now?\n"; + } + const auto diff = now - b; + const auto ms = duration_cast<milliseconds>(diff); + const auto us = duration_cast<microseconds>(diff - ms); + const auto ns = duration_cast<nanoseconds>(diff - ms - us); + std::cout << " frame ended: " << ms << " ms " << us + << " us " << ns << " ns ago\n"; + } - if (F > N) { - gpu_frametimes.pop_front(); - cpu_frametimes.pop_front(); + const auto gpu_time = b - a; + { + using namespace std::chrono; + const auto diff = gpu_time; + const auto ms = duration_cast<milliseconds>(diff); + const auto us = duration_cast<microseconds>(diff - ms); + const auto ns = duration_cast<nanoseconds>(diff - ms - us); + std::cout << " gpu_time: " << ms << " ms " << us + << " us " << ns << " ns ago\n"; } - */ + /* + cpu_frametimes.emplace_back(cpu_time); + gpu_frametimes.emplace_back(gpu_time); +} + +/* +if (remove_index.has_value()) { + this->in_flight_frames.erase(std::begin(this->in_flight_frames), + std::begin(this->in_flight_frames) + + *remove_index); +} +*/ + +/* +auto g_copy = gpu_frametimes; +auto c_copy = cpu_frametimes; +std::ranges::sort(g_copy); +std::ranges::sort(c_copy); + +constexpr auto N = 49; +if (std::size(cpu_frametimes) < N) { return std::nullopt; } +const auto F = std::size(g_copy); +// close enough to median lol +const auto g = g_copy[F / 2]; +const auto c = c_copy[F / 2]; + +std::cout << g << '\n'; + +std::cout << " median gpu: " << (g / 1'000'000) << " ms " << g / 1'000 + << " us " << g << " ns\n"; +std::cout << " median cpu: " << c / 1'000'000 << " ms " << c / 1'000 + << " us " << c << " ns\n"; + +if (F > N) { + gpu_frametimes.pop_front(); + cpu_frametimes.pop_front(); +} + +return std::nullopt; +} +*/ + } // namespace low_latency
\ No newline at end of file diff --git a/src/queue_context.hh b/src/queue_context.hh index a6f43e5..3df6af4 100644 --- a/src/queue_context.hh +++ b/src/queue_context.hh @@ -10,7 +10,6 @@ #include <chrono> #include <deque> #include <memory> -#include <span> #include <unordered_set> namespace low_latency { @@ -35,24 +34,26 @@ class QueueContext final : public Context { struct Submission { const std::unordered_set<VkSemaphore> signals; const std::unordered_set<VkSemaphore> waits; - const std::uint64_t target_semaphore_sequence; - const std::shared_ptr<TimestampPool::Handle> timestamp_handle; + const std::uint64_t sequence; + + const std::shared_ptr<TimestampPool::Handle> start_handle; + const std::shared_ptr<TimestampPool::Handle> end_handle; }; std::deque<std::shared_ptr<Submission>> submissions; // In flight frames! // These might come from different contexts. struct Frame { - const QueueContext& start_context; - const std::shared_ptr<TimestampPool::Handle> start; - const std::uint64_t target_start_sequence; - const QueueContext& end_context; - const std::shared_ptr<TimestampPool::Handle> end; - const std::uint64_t target_end_sequence; + struct Timepoint { + const QueueContext& context; + const std::shared_ptr<TimestampPool::Handle> handle; + const std::uint64_t sequence; + }; + + const Timepoint start; + const Timepoint end; }; - // These can be null, it means we made presented without finding the - // timestamps associated with the present. std::deque<std::unique_ptr<Frame>> in_flight_frames; public: @@ -61,12 +62,12 @@ class QueueContext final : public Context { virtual ~QueueContext(); public: - void notify_submit(std::span<const VkSubmitInfo> infos, - const std::uint64_t target_semaphore_sequence, - std::shared_ptr<TimestampPool::Handle>&& handle); - void notify_submit(std::span<const VkSubmitInfo2> infos, - const std::uint64_t target_semaphore_sequence, - std::shared_ptr<TimestampPool::Handle>&& handle); + void + notify_submit(const VkSubmitInfo& info, const std::uint64_t& sequence, + const std::shared_ptr<TimestampPool::Handle> head_handle, + const std::shared_ptr<TimestampPool::Handle> tail_handle); + + // TODO submit2 void notify_present(const VkPresentInfoKHR& info); diff --git a/src/timestamp_pool.cc b/src/timestamp_pool.cc index b4dc3c9..cf48873 100644 --- a/src/timestamp_pool.cc +++ b/src/timestamp_pool.cc @@ -3,142 +3,152 @@ #include "queue_context.hh" #include <ranges> +#include <vulkan/utility/vk_dispatch_table.h> #include <vulkan/vulkan_core.h> namespace low_latency { -TimestampPool::Block TimestampPool::allocate() { - const auto& device_context = this->queue_context.device_context; +TimestampPool::QueryChunk::QueryChunk(const QueueContext& queue_context) { + const auto& device_context = queue_context.device_context; + const auto& vtable = device_context.vtable; - const auto query_pool = [&]() -> VkQueryPool { + this->query_pool = [&]() { const auto qpci = VkQueryPoolCreateInfo{ .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, .queryType = VK_QUERY_TYPE_TIMESTAMP, - .queryCount = this->TIMESTAMP_QUERY_POOL_SIZE}; - - auto query_pool = VkQueryPool{}; + .queryCount = QueryChunk::CHUNK_SIZE}; - device_context.vtable.CreateQueryPool(device_context.device, &qpci, - nullptr, &query_pool); - return query_pool; + auto qp = VkQueryPool{}; + vtable.CreateQueryPool(device_context.device, &qpci, nullptr, &qp); + return qp; }(); - const auto key_range = - std::views::iota(0u, this->TIMESTAMP_QUERY_POOL_SIZE / 2) | - std::views::transform([](const std::uint64_t& i) { return 2 * i; }); - - auto available_indices = std::make_unique<available_query_indicies_t>( - available_query_indicies_t{std::begin(key_range), std::end(key_range)}); - - auto command_buffers = [&, this]() -> auto { - auto command_buffers = - std::vector<VkCommandBuffer>(this->TIMESTAMP_QUERY_POOL_SIZE); + constexpr auto key_range = std::views::iota(0u, QueryChunk::CHUNK_SIZE); + this->free_indices = std::make_unique<free_indices_t>(std::begin(key_range), + std::end(key_range)); + this->command_buffers = [&, this]() -> auto { + auto cbs = std::make_unique<std::vector<VkCommandBuffer>>(CHUNK_SIZE); const auto cbai = VkCommandBufferAllocateInfo{ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, - .commandPool = this->queue_context.command_pool, + .commandPool = queue_context.command_pool, .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, - .commandBufferCount = - static_cast<std::uint32_t>(std::size(command_buffers)), + .commandBufferCount = static_cast<std::uint32_t>(std::size(*cbs)), }; - device_context.vtable.AllocateCommandBuffers( - device_context.device, &cbai, std::data(command_buffers)); - std::ranges::for_each(command_buffers, [&](const auto& cb) { - device_context.sdld(device_context.device, cb); - }); - return std::make_unique<std::vector<VkCommandBuffer>>(command_buffers); + vtable.AllocateCommandBuffers(device_context.device, &cbai, + std::data(*cbs)); + return cbs; }(); - - return Block{.query_pool = query_pool, - .available_indicies = std::move(available_indices), - .command_buffers = std::move(command_buffers)}; } +TimestampPool::QueryChunk::~QueryChunk() {} + TimestampPool::TimestampPool(QueueContext& queue_context) : queue_context(queue_context) { - // Allocate one block on construction, it's likely more than enough! - this->blocks.emplace_back(this->allocate()); + // Allocate one block on construction, it's likely more than enough. + auto query_chunk = std::make_shared<QueryChunk>(this->queue_context); + this->query_chunks.emplace(std::move(query_chunk)); } std::shared_ptr<TimestampPool::Handle> TimestampPool::acquire() { - const auto vacant_iter = [this]() -> auto { - const auto it = - std::ranges::find_if(this->blocks, [](const auto& block) { - return std::size(*block.available_indicies); + + // Gets the empty one, or inserts a new one and returns it. + const auto not_empty_iter = [this]() -> auto { + const auto not_empty_iter = + std::ranges::find_if(this->query_chunks, [](const auto& qc) { + assert(qc); + return std::size(*qc->free_indices); }); - if (it != std::end(this->blocks)) { - return it; + if (not_empty_iter != std::end(this->query_chunks)) { + return not_empty_iter; } - this->blocks.emplace_back(this->allocate()); - return std::prev(std::end(this->blocks)); - }(); - - const auto query_pool = vacant_iter->query_pool; - auto& available_indices = *vacant_iter->available_indicies; - // Grab any element from our set and erase it immediately after. - const auto query_index = *std::begin(available_indices); - available_indices.erase(std::begin(available_indices)); - - const auto command_buffers = [&]() -> auto { - auto command_buffers = std::array<VkCommandBuffer, 2>{}; - std::ranges::copy_n( - std::next(std::begin(*vacant_iter->command_buffers), query_index), - std::size(command_buffers), std::begin(command_buffers)); - return command_buffers; + const auto insert = std::make_shared<QueryChunk>(this->queue_context); + const auto [iter, did_insert] = this->query_chunks.emplace(insert); + assert(did_insert); + return iter; }(); - const auto block_index = static_cast<std::size_t>( - std::distance(std::begin(this->blocks), vacant_iter)); + // Grab any element from our set and erase it immediately after. + auto& indices = *(*not_empty_iter)->free_indices; + const auto query_index = *std::begin(indices); + assert(indices.erase(query_index)); - return std::make_shared<Handle>(available_indices, block_index, query_pool, - query_index, command_buffers); + return std::make_shared<Handle>(*not_empty_iter, query_index); } -TimestampPool::Handle::Handle( - TimestampPool::available_query_indicies_t& index_origin, - const std::size_t block_index, const VkQueryPool& query_pool, - const std::uint64_t query_index, - const std::array<VkCommandBuffer, 2>& command_buffers) - : index_origin(index_origin), block_index(block_index), - query_pool(query_pool), query_index(query_index), - command_buffers(command_buffers) {} +TimestampPool::Handle::Handle(const std::shared_ptr<QueryChunk>& origin_chunk, + const std::uint64_t& query_index) + : query_pool(origin_chunk->query_pool), query_index(query_index), + origin_chunk(origin_chunk), + command_buffer((*origin_chunk->command_buffers)[query_index]) {} TimestampPool::Handle::~Handle() { - assert(this->index_origin.insert(this->query_index).second); + // Parent destructing shouldn't mean we should have a bunch of insertions + // for zero reason. + if (const auto ptr = this->origin_chunk.lock(); ptr) { + assert(ptr->free_indices->insert(this->query_index).second); + } } void TimestampPool::Handle::setup_command_buffers( - const VkuDeviceDispatchTable& vtable) const { - - const auto& [head, tail] = this->command_buffers; + const Handle& tail, const QueueContext& queue_context) const { const auto cbbi = VkCommandBufferBeginInfo{ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, }; - // Heads - vtable.ResetCommandBuffer(head, 0); - vtable.BeginCommandBuffer(head, &cbbi); - // Reset the next two and make them unavailable when they are run! - vtable.CmdResetQueryPool(head, this->query_pool, this->query_index, 2); - vtable.CmdWriteTimestamp2KHR(head, VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, + + const auto& device_context = queue_context.device_context; + const auto& vtable = device_context.vtable; + + vtable.ResetQueryPoolEXT(device_context.device, this->query_pool, + this->query_index, 1); + + vtable.BeginCommandBuffer(this->command_buffer, &cbbi); + vtable.CmdWriteTimestamp2KHR(this->command_buffer, + VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT, this->query_pool, this->query_index); - vtable.EndCommandBuffer(head); - - // Tails - vtable.ResetCommandBuffer(tail, 0); - vtable.BeginCommandBuffer(tail, &cbbi); - vtable.CmdWriteTimestamp2KHR(tail, VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT, - this->query_pool, this->query_index + 1); - vtable.EndCommandBuffer(tail); + vtable.EndCommandBuffer(this->command_buffer); + + vtable.ResetQueryPoolEXT(device_context.device, tail.query_pool, + tail.query_index, 1); + vtable.ResetCommandBuffer(tail.command_buffer, 0); + vtable.BeginCommandBuffer(tail.command_buffer, &cbbi); + vtable.CmdWriteTimestamp2KHR(tail.command_buffer, + VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT, + tail.query_pool, tail.query_index); + vtable.EndCommandBuffer(tail.command_buffer); } -void TimestampPool::poll() { - this->cached_timestamps.clear(); - this->cached_timestamps.reserve(std::size(this->blocks)); +std::optional<std::uint64_t> +TimestampPool::Handle::get_ticks(const TimestampPool& pool) { + + const auto& device_context = pool.queue_context.device_context; + const auto& vtable = device_context.vtable; + + struct QueryResult { + std::uint64_t value; + std::uint64_t available; + }; + auto query_result = QueryResult{}; + + const auto r = vtable.GetQueryPoolResults( + device_context.device, query_pool, this->query_index, 1, + sizeof(query_result), &query_result, sizeof(query_result), + VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT); + + assert(r == VK_SUCCESS || r == VK_NOT_READY); + + if (!query_result.available) { + return std::nullopt; + } + return query_result.value; +} +/* +void TimestampPool::poll() { const auto& device_context = this->queue_context.device_context; std::ranges::transform( @@ -163,26 +173,16 @@ void TimestampPool::poll() { return timestamps; }); }; - -std::uint64_t TimestampPool::get_polled(const Handle& handle, const bool hack) { - - assert(handle.block_index < std::size(this->cached_timestamps)); - - const auto& cached_timestamp = this->cached_timestamps[handle.block_index]; - assert(cached_timestamp != nullptr); - assert(handle.query_index < std::size(*cached_timestamp)); - - return (*cached_timestamp)[handle.query_index + hack]; -} +*/ TimestampPool::~TimestampPool() { const auto& device = this->queue_context.device_context.device; const auto& vtable = this->queue_context.device_context.vtable; - for (const auto& block : this->blocks) { + for (const auto& query_chunk : this->query_chunks) { vtable.FreeCommandBuffers(device, this->queue_context.command_pool, - std::size(*block.command_buffers), - std::data(*block.command_buffers)); - vtable.DestroyQueryPool(device, block.query_pool, nullptr); + std::size(*query_chunk->command_buffers), + std::data(*query_chunk->command_buffers)); + vtable.DestroyQueryPool(device, query_chunk->query_pool, nullptr); } } diff --git a/src/timestamp_pool.hh b/src/timestamp_pool.hh index a4aa429..f69b06f 100644 --- a/src/timestamp_pool.hh +++ b/src/timestamp_pool.hh @@ -40,6 +40,7 @@ #include <memory> #include <unordered_set> +#include <vector> namespace low_latency { @@ -47,58 +48,62 @@ class QueueContext; class TimestampPool final { private: - static constexpr auto TIMESTAMP_QUERY_POOL_SIZE = 512u; - static_assert(TIMESTAMP_QUERY_POOL_SIZE % 2 == 0); - - private: QueueContext& queue_context; - // VkQueryPool with an unordered set of keys available for reading. - using available_query_indicies_t = std::unordered_set<std::uint64_t>; + // A chunk of data which is useful for making timestamp queries. + // Allows association of an index to a query pool and command buffer. + // We reuse these when they're released. + struct QueryChunk final { + private: + using free_indices_t = std::unordered_set<std::uint64_t>; + static constexpr auto CHUNK_SIZE = 512u; - struct Block { + public: VkQueryPool query_pool; - std::unique_ptr<available_query_indicies_t> available_indicies; + std::unique_ptr<free_indices_t> free_indices; std::unique_ptr<std::vector<VkCommandBuffer>> command_buffers; - }; - std::vector<Block> blocks; // multiple blocks - // A snapshot of all available blocks for reading after each poll. - std::vector<std::unique_ptr<std::vector<std::uint64_t>>> cached_timestamps; + public: + QueryChunk(const QueueContext& queue_context); + QueryChunk(const QueryChunk& handle) = delete; + QueryChunk(QueryChunk&&) = delete; + QueryChunk operator=(const QueryChunk& handle) = delete; + QueryChunk operator=(QueryChunk&&) = delete; + ~QueryChunk(); + }; + std::unordered_set<std::shared_ptr<QueryChunk>> query_chunks; public: - // A handle represents two std::uint64_t blocks oftimestamp memory and two - // command buffers. + // A handle represents a VkCommandBuffer and a query index. + // Once the Handle goes out of scope, the query index will be returned + // to the parent pool. struct Handle final { private: friend class TimestampPool; private: - available_query_indicies_t& index_origin; - const std::size_t block_index; + const std::weak_ptr<QueryChunk> origin_chunk; public: const VkQueryPool query_pool; const std::uint64_t query_index; - const std::array<VkCommandBuffer, 2> command_buffers; + const VkCommandBuffer command_buffer; public: - Handle(TimestampPool::available_query_indicies_t& index_origin, - const std::size_t block_index, const VkQueryPool& query_pool, - const std::uint64_t query_index, - const std::array<VkCommandBuffer, 2>& command_buffers); + Handle(const std::shared_ptr<QueryChunk>& origin_chunk, + const std::uint64_t& query_index); Handle(const Handle& handle) = delete; Handle(Handle&&) = delete; Handle operator=(const Handle& handle) = delete; Handle operator=(Handle&&) = delete; - ~Handle(); // frees from the pool + ~Handle(); public: - void setup_command_buffers(const VkuDeviceDispatchTable& vtable) const; - }; + void setup_command_buffers(const Handle& tail, + const QueueContext& queue_context) const; - private: - Block allocate(); + std::optional<std::uint64_t> get_ticks(const TimestampPool& pool); + }; public: TimestampPool(QueueContext& queue_context); @@ -109,12 +114,8 @@ class TimestampPool final { ~TimestampPool(); public: - // Hands out a Handle with a pool and index of two uint64_t's. + // Hands out a Handle! std::shared_ptr<Handle> acquire(); - - void poll(); // saves the current state for future get's. - - std::uint64_t get_polled(const Handle& handle, const bool hack = false); }; } // namespace low_latency |
