1 files changed, 267 insertions, 162 deletions
diff --git a/src/queue_context.cc b/src/queue_context.cc
index 9b46773..99cf51e 100644
--- a/src/queue_context.cc
+++ b/src/queue_context.cc
@@ -67,25 +67,20 @@ QueueContext::~QueueContext() {
 }
 
 void QueueContext::notify_submit(
-    std::span<const VkSubmitInfo> infos,
-    const std::uint64_t target_semaphore_sequence,
-    std::shared_ptr<TimestampPool::Handle>&& handle) {
-
-    // This has an issue where we're collecting all signals and waits and
-    // treating a single submit call as finishing
+    const VkSubmitInfo& info, const std::uint64_t& target_semaphore_sequence,
+    const std::shared_ptr<TimestampPool::Handle> head_handle,
+    const std::shared_ptr<TimestampPool::Handle> tail_handle) {
 
     auto signals = std::unordered_set<VkSemaphore>{};
     auto waits = std::unordered_set<VkSemaphore>{};
-    for (const auto& info : infos) {
-        std::ranges::copy_n(info.pWaitSemaphores, info.waitSemaphoreCount,
-                            std::inserter(waits, std::end(waits)));
-        std::ranges::copy_n(info.pSignalSemaphores, info.signalSemaphoreCount,
-                            std::inserter(signals, std::end(signals)));
-    }
+    std::ranges::copy_n(info.pWaitSemaphores, info.waitSemaphoreCount,
+                        std::inserter(waits, std::end(waits)));
+    std::ranges::copy_n(info.pSignalSemaphores, info.signalSemaphoreCount,
+                        std::inserter(signals, std::end(signals)));
 
     this->submissions.emplace_back(std::make_unique<Submission>(
         std::move(signals), std::move(waits), target_semaphore_sequence,
-        std::move(handle)));
+        head_handle, tail_handle));
 
     // TODO HACK
     if (std::size(this->submissions) > 100) {
@@ -93,6 +88,7 @@ void QueueContext::notify_submit(
     }
 }
 
+/*
 void QueueContext::notify_submit(
     std::span<const VkSubmitInfo2> infos,
     const std::uint64_t target_semaphore_sequence,
@@ -100,6 +96,7 @@ void QueueContext::notify_submit(
 
     auto signals = std::unordered_set<VkSemaphore>{};
     auto waits = std::unordered_set<VkSemaphore>{};
+
     for (const auto& info : infos) {
         constexpr auto get_semaphore = [](const auto& semaphore_info) {
             return semaphore_info.semaphore;
@@ -124,21 +121,18 @@ void QueueContext::notify_submit(
         this->submissions.pop_front();
     }
 }
+*/
 
 void QueueContext::notify_present(const VkPresentInfoKHR& info) {
 
-    auto frame = [&]() -> std::unique_ptr<Frame> {
-        const auto waits = [&]() {
-            auto waits = std::unordered_set<VkSemaphore>{};
-            std::ranges::copy_n(info.pWaitSemaphores, info.waitSemaphoreCount,
-                                std::inserter(waits, std::end(waits)));
-            return waits;
-        }();
-
-        const auto wait_semaphores = std::unordered_set<VkSemaphore>{
-            info.pWaitSemaphores,
-            std::next(info.pWaitSemaphores, info.waitSemaphoreCount)};
+    const auto waits = [&]() {
+        auto waits = std::unordered_set<VkSemaphore>{};
+        std::ranges::copy_n(info.pWaitSemaphores, info.waitSemaphoreCount,
+                            std::inserter(waits, std::end(waits)));
+        return waits;
+    }();
 
+    const auto collected_semaphores = [&info, this]() {
         auto collected_semaphores = std::unordered_set<VkSemaphore>{};
         for (auto i = std::uint32_t{0}; i < info.swapchainCount; ++i) {
             const auto& swapchain = info.pSwapchains[i];
@@ -153,112 +147,147 @@ void QueueContext::notify_present(const VkPresentInfoKHR& info) {
             const auto index_it = swapchain_it->second.find(index);
             assert(index_it != std::end(swapchain_it->second));
 
-            const auto semaphore = index_it->second;
+            const auto& semaphore = index_it->second;
             collected_semaphores.emplace(index_it->second);
         }
+        return collected_semaphores;
+    }();
 
-        const auto start_submission_it = std::ranges::find_if(
-            std::rbegin(this->submissions), std::rend(this->submissions),
-            [&](const auto& submission) {
-                return std::ranges::any_of(
-                    submission->waits, [&](const auto& wait) {
-                        return collected_semaphores.contains(wait);
-                    });
-            });
-
-        if (start_submission_it == std::rend(this->submissions)) {
-            std::cout << "couldn't find starting submission!\n";
-            return nullptr;
-        }
-        const auto& start_submission = *start_submission_it;
-
-        const auto end_submission_it = std::ranges::find_if(
-            std::rbegin(this->submissions), std::rend(this->submissions),
-            [&](const auto& submission) {
-                return std::ranges::any_of(
-                    submission->signals, [&](const auto& signal) {
-                        return wait_semaphores.contains(signal);
-                    });
-            });
-
-        if (end_submission_it == std::rend(this->submissions)) {
-            std::cout << "couldn't find ending submission!\n";
-            return nullptr;
-        }
-        const auto& end_submission = *end_submission_it;
-
-        return std::make_unique<Frame>(Frame{
-            .start_context = *this,
-            .start = start_submission->timestamp_handle,
-            .target_start_sequence =
-                start_submission->target_semaphore_sequence,
-            .end_context = *this,
-            .end = start_submission->timestamp_handle,
-            .target_end_sequence = start_submission->target_semaphore_sequence,
+    const auto start_iter = std::ranges::find_if(
+        std::rbegin(this->submissions), std::rend(this->submissions),
+        [&](const auto& submission) {
+            return std::ranges::any_of(
+                submission->waits, [&](const auto& wait) {
+                    return collected_semaphores.contains(wait);
+                });
         });
-    }();
 
-    this->in_flight_frames.emplace_back(std::move(frame));
-    
+    if (start_iter == std::rend(this->submissions)) {
+        std::cout << "couldn't find starting submission!\n";
+        return;
+    }
+    const auto& start = *start_iter;
+
+    const auto end_iter = std::ranges::find_if(
+        std::rbegin(this->submissions), std::rend(this->submissions),
+        [&](const auto& submission) {
+            return std::ranges::any_of(
+                submission->signals,
+                [&](const auto& signal) { return waits.contains(signal); });
+        });
+
+    if (end_iter == std::rend(this->submissions)) {
+        std::cout << "couldn't find ending submission!\n";
+        return;
+    }
+    const auto& end = *end_iter;
+
+    auto frame = Frame{.start =
+                           Frame::Timepoint{
+                               .context = *this,
+                               .handle = start->start_handle,
+                               .sequence = start->sequence,
+                           },
+                       .end = Frame::Timepoint{
+                           .context = *this,
+                           .handle = end->end_handle,
+                           .sequence = end->sequence,
+                       }};
+    this->in_flight_frames.emplace_back(
+        std::make_unique<Frame>(std::move(frame)));
+
     // hack
     if (this->in_flight_frames.size() > 5) {
         this->in_flight_frames.pop_front();
     }
 }
 
-// now it's all coming together
 std::optional<QueueContext::duration_t> QueueContext::get_delay_time() {
     if (!std::size(this->in_flight_frames)) {
         return std::nullopt;
     }
 
-    auto seq = std::uint64_t{};
-    this->device_context.vtable.GetSemaphoreCounterValueKHR(
-        this->device_context.device, this->semaphore, &seq);
-
-    // Get semaphore first, then poll!
-    this->timestamp_pool->poll();
+    // We are about to query the wait semaphores of all of our current
+    // frames in flight. They may come from the same device, so we're going
+    // to build a mapping here to reduce Vulkan calls. Not only that,
+    // we have to do this or else our timing information becomes broken
+    // as this loop iterates.
+    const auto target_devices = [this]() -> auto {
+        using context_ref_t = std::reference_wrapper<DeviceContext>;
+        auto target_devices = std::unordered_map<VkDevice, context_ref_t>{};
+        for (const auto& frame : this->in_flight_frames) {
+            auto& start = frame->start.context.device_context;
+            auto& end = frame->end.context.device_context;
+
+            target_devices.try_emplace(start.device, std::ref(start));
+            target_devices.try_emplace(end.device, std::ref(end));
+        }
+        return target_devices;
+    }();
 
-    // idk how frequently we should call this.
-    this->device_context.calibrate_timestamps();
+    // Calibrate timestamps before we acquire semaphores.
+    for (const auto& pair : target_devices) {
+        auto& device = pair.second;
+        device_context.clock.calibrate();
+    }
 
-    static auto gpu_frametimes = std::deque<uint64_t>{};
-    static auto cpu_frametimes = std::deque<uint64_t>{};
+    // Now we have all owned devices and their clocks are in a good state.
+    // We now map each queue to a semaphore counter value, queried once each.
+    const auto queue_sequences = [this]() -> auto {
+        auto queue_sequences = std::unordered_map<VkQueue, std::uint64_t>{};
+        for (const auto& frame : this->in_flight_frames) {
+            auto& start = frame->start.context;
+            auto& end = frame->end.context;
+
+            for (const auto& queue_ptr : {&start, &end}) {
+                if (queue_sequences.contains(queue_ptr->queue)) {
+                    continue;
+                }
+
+                const auto& vtable = queue_ptr->device_context.vtable;
+                auto seq = std::uint64_t{};
+                vtable.GetSemaphoreCounterValueKHR(this->device_context.device,
+                                                   this->semaphore, &seq);
+                queue_sequences.emplace(queue_ptr->queue, seq);
+            }
+        }
+        return queue_sequences;
+    }();
 
+    // Now all devices we are about to query are primed for querying.
+    // We have the sequence numbers of all queues we could possibly query.
     const auto S = std::size(this->in_flight_frames);
+    for (auto i = std::size_t{0}; i < S; ++i) {
+        assert(this->in_flight_frames[i]);
+        const auto& frame = *this->in_flight_frames[i];
+        const auto& start = frame.start;
+        const auto& end = frame.end;
 
-    std::cout << "\nSTART FRAME READOUT\n";
-    std::cout << "error bound: " << this->device_context.clock.error_bound
-              << '\n';
-    std::cout << "num frames in flight: " << S << '\n';
-    std::cout << "from oldest -> newest\n";
-
-    // const auto b_seq = semaphore_from_context(*this);
-    const auto now = std::chrono::steady_clock::now();
-
-    auto i = std::size_t{0};
-    for (; i < std::size(this->in_flight_frames); ++i) {
-        const auto& frame = this->in_flight_frames[i];
         std::cout << "    Evaluating the frame that's " << S - i - 1
                   << " behind\n";
-        if (!frame) {
-            std::cout << "        nullptr!\n";
+
+        std::cout << "    target start seq: " << start.sequence << '\n';
+        std::cout << "    target end seq: " << end.sequence << '\n';
+
+        const auto start_seq_it = queue_sequences.find(start.context.queue);
+        assert(start_seq_it != std::end(queue_sequences));
+        const auto& start_seq = start_seq_it->second;
+        if (start_seq < start.sequence) {
+            std::cout << "        frame hasn't started yet !\n ";
             continue;
         }
 
-        std::cout << "    target start: " << frame->target_start_sequence << '\n';
-        std::cout << "    target end: " << frame->target_end_sequence << '\n';
-        if (seq < frame->target_start_sequence) {
-            std::cout << "        frame hasn't started yet!\n";
-            continue;
+        /*
+        const auto start_ticks_opt =
+            start.handle->get_ticks(*start.context.timestamp_pool);
+        if (!start_ticks_opt.has_value()) {
+            std::cout << "        frame hasn't started yet !\n ";
         }
 
-        const auto start_ticks =
-            frame->start_context.timestamp_pool->get_polled(*frame->start);
         std::cout << "        START TICKS: " << start_ticks << '\n';
-        const auto& a_clock = frame->start_context.device_context.clock;
-        const auto a = a_clock.ticks_to_time(start_ticks);
-        
+        const auto start_time =
+            start.context.device_context.clock.ticks_to_time(start_ticks);
+
         {
             using namespace std::chrono;
             const auto diff = now - a;
@@ -269,85 +298,161 @@ std::optional<QueueContext::duration_t> QueueContext::get_delay_time() {
                       << " us " << ns << " ns ago\n";
         }
 
-        if (seq < frame->target_end_sequence) {
-            std::cout << "        frame hasn't ended yet!\n";
+        const auto end_seq_it = queue_sequences.find(end.context.queue);
+        assert(end_seq_it != std::end(queue_sequences));
+        const auto& end_seq = end_seq_it->second;
+        if (start_seq < end.sequence) {
+            std::cout << "        frame hasn't started yet !\n ";
             continue;
         }
+        */
+    }
 
+    return std::nullopt;
+    //
+}
 
-        const auto end_ticks =
-            frame->end_context.timestamp_pool->get_polled(*frame->end, true);
-        const auto& b_clock = frame->end_context.device_context.clock;
-        std::cout << "        END_TICKS: " << end_ticks << '\n';
-        const auto b = b_clock.ticks_to_time(end_ticks);
-        {
-            using namespace std::chrono;
-            if (now <= b) {
-                std::cout << "b happened before now?\n";
-            }
-            const auto diff = now - b;
-            const auto ms = duration_cast<milliseconds>(diff);
-            const auto us = duration_cast<microseconds>(diff - ms);
-            const auto ns = duration_cast<nanoseconds>(diff - ms - us);
-            std::cout << "        frame ended: " << ms << " ms " << us
-                      << " us " << ns << " ns ago\n";
-        }
+// now it's all coming together
+// std::optional<QueueContext::duration_t> QueueContext::get_delay_time() {
+/*
+if (!std::size(this->in_flight_frames)) {
+    return std::nullopt;
+}
 
-        const auto gpu_time = b - a;
-        {
-            using namespace std::chrono;
-            const auto diff = gpu_time;
-            const auto ms = duration_cast<milliseconds>(diff);
-            const auto us = duration_cast<microseconds>(diff - ms);
-            const auto ns = duration_cast<nanoseconds>(diff - ms - us);
-            std::cout << "        gpu_time: " << ms << " ms " << us
-                      << " us " << ns << " ns ago\n";
-        }
+auto seq = std::uint64_t{};
+this->device_context.vtable.GetSemaphoreCounterValueKHR(
+    this->device_context.device, this->semaphore, &seq);
 
-        /*
-        cpu_frametimes.emplace_back(cpu_time);
-        gpu_frametimes.emplace_back(gpu_time);
-        */
-    }
+// Get semaphore first, then poll!
+this->timestamp_pool->poll();
 
-    /*
-    if (remove_index.has_value()) {
-        this->in_flight_frames.erase(std::begin(this->in_flight_frames),
-                                     std::begin(this->in_flight_frames) +
-                                         *remove_index);
+// idk how frequently we should call this.
+this->device_context.calibrate_timestamps();
+
+static auto gpu_frametimes = std::deque<uint64_t>{};
+static auto cpu_frametimes = std::deque<uint64_t>{};
+
+const auto S = std::size(this->in_flight_frames);
+
+std::cout << "\nSTART FRAME READOUT\n";
+std::cout << "error bound: " << this->device_context.clock.error_bound
+          << '\n';
+std::cout << "num frames in flight: " << S << '\n';
+std::cout << "from oldest -> newest\n";
+
+// const auto b_seq = semaphore_from_context(*this);
+const auto now = std::chrono::steady_clock::now();
+
+auto i = std::size_t{0};
+for (; i < std::size(this->in_flight_frames); ++i) {
+    const auto& frame = this->in_flight_frames[i];
+    std::cout << "    Evaluating the frame that's " << S - i - 1
+              << " behind\n";
+    if (!frame) {
+        std::cout << "        nullptr!\n";
+        continue;
     }
-    */
 
-    /*
-    auto g_copy = gpu_frametimes;
-    auto c_copy = cpu_frametimes;
-    std::ranges::sort(g_copy);
-    std::ranges::sort(c_copy);
+    std::cout << "    target start: " << frame->target_start_sequence <<
+'\n'; std::cout << "    target end: " << frame->target_end_sequence << '\n'; if
+(seq < frame->target_start_sequence) { std::cout << "        frame hasn't
+started yet!\n"; continue;
+    }
 
-    constexpr auto N = 49;
-    if (std::size(cpu_frametimes) < N) {
-        return std::nullopt;
+    const auto start_ticks =
+        frame->start_context.timestamp_pool->get_polled(*frame->start);
+    std::cout << "        START TICKS: " << start_ticks << '\n';
+    const auto& a_clock = frame->start_context.device_context.clock;
+    const auto a = a_clock.ticks_to_time(start_ticks);
+
+    {
+        using namespace std::chrono;
+        const auto diff = now - a;
+        const auto ms = duration_cast<milliseconds>(diff);
+        const auto us = duration_cast<microseconds>(diff - ms);
+        const auto ns = duration_cast<nanoseconds>(diff - ms - us);
+        std::cout << "        frame started: " << ms << " ms " << us
+                  << " us " << ns << " ns ago\n";
     }
 
-    const auto F = std::size(g_copy);
-    // close enough to median lol
-    const auto g = g_copy[F / 2];
-    const auto c = c_copy[F / 2];
+    if (seq < frame->target_end_sequence) {
+        std::cout << "        frame hasn't ended yet!\n";
+        continue;
+    }
 
-    std::cout << g << '\n';
 
-    std::cout << "    median gpu: " << (g / 1'000'000) << " ms " << g / 1'000
-              << " us " << g << " ns\n";
-    std::cout << "    median cpu: " << c / 1'000'000 << " ms " << c / 1'000
-              << " us " << c << " ns\n";
+    const auto end_ticks =
+        frame->end_context.timestamp_pool->get_polled(*frame->end, true);
+    const auto& b_clock = frame->end_context.device_context.clock;
+    std::cout << "        END_TICKS: " << end_ticks << '\n';
+    const auto b = b_clock.ticks_to_time(end_ticks);
+    {
+        using namespace std::chrono;
+        if (now <= b) {
+            std::cout << "b happened before now?\n";
+        }
+        const auto diff = now - b;
+        const auto ms = duration_cast<milliseconds>(diff);
+        const auto us = duration_cast<microseconds>(diff - ms);
+        const auto ns = duration_cast<nanoseconds>(diff - ms - us);
+        std::cout << "        frame ended: " << ms << " ms " << us
+                  << " us " << ns << " ns ago\n";
+    }
 
-    if (F > N) {
-        gpu_frametimes.pop_front();
-        cpu_frametimes.pop_front();
+    const auto gpu_time = b - a;
+    {
+        using namespace std::chrono;
+        const auto diff = gpu_time;
+        const auto ms = duration_cast<milliseconds>(diff);
+        const auto us = duration_cast<microseconds>(diff - ms);
+        const auto ns = duration_cast<nanoseconds>(diff - ms - us);
+        std::cout << "        gpu_time: " << ms << " ms " << us
+                  << " us " << ns << " ns ago\n";
     }
-    */
 
+    /*
+    cpu_frametimes.emplace_back(cpu_time);
+    gpu_frametimes.emplace_back(gpu_time);
+}
+
+/*
+if (remove_index.has_value()) {
+    this->in_flight_frames.erase(std::begin(this->in_flight_frames),
+                                 std::begin(this->in_flight_frames) +
+                                     *remove_index);
+}
+*/
+
+/*
+auto g_copy = gpu_frametimes;
+auto c_copy = cpu_frametimes;
+std::ranges::sort(g_copy);
+std::ranges::sort(c_copy);
+
+constexpr auto N = 49;
+if (std::size(cpu_frametimes) < N) {
     return std::nullopt;
 }
 
+const auto F = std::size(g_copy);
+// close enough to median lol
+const auto g = g_copy[F / 2];
+const auto c = c_copy[F / 2];
+
+std::cout << g << '\n';
+
+std::cout << "    median gpu: " << (g / 1'000'000) << " ms " << g / 1'000
+          << " us " << g << " ns\n";
+std::cout << "    median cpu: " << c / 1'000'000 << " ms " << c / 1'000
+          << " us " << c << " ns\n";
+
+if (F > N) {
+    gpu_frametimes.pop_front();
+    cpu_frametimes.pop_front();
+}
+
+return std::nullopt;
+}
+*/
+
 } // namespace low_latency
 \ No newline at end of file