2 files changed, 177 insertions, 314 deletions
diff --git a/src/queue_context.cc b/src/queue_context.cc
index 90039f2..7121898 100644
--- a/src/queue_context.cc
+++ b/src/queue_context.cc
@@ -6,6 +6,7 @@
 #include <chrono>
 #include <iostream>
 #include <limits>
+#include <ranges>
 #include <span>
 
 namespace low_latency {
@@ -141,119 +142,41 @@ void QueueContext::notify_submit(
 
 void QueueContext::notify_present(const VkPresentInfoKHR& info) {
 
-    const auto waits = [&]() {
-        auto waits = std::unordered_set<VkSemaphore>{};
-        std::ranges::copy_n(info.pWaitSemaphores, info.waitSemaphoreCount,
-                            std::inserter(waits, std::end(waits)));
-        return waits;
-    }();
-
-    const auto collected_semaphores = [&info, this]() {
-        auto collected_semaphores = std::unordered_set<VkSemaphore>{};
-        for (auto i = std::uint32_t{0}; i < info.swapchainCount; ++i) {
-            const auto& swapchain = info.pSwapchains[i];
-            const auto& index = info.pImageIndices[i];
-
-            // Shouldn't be possible to present to a swapchain that wasn't
-            // waited in
-
-            const auto& signals = this->device_context.swapchain_signals;
-            const auto swapchain_it = signals.find(swapchain);
-            assert(swapchain_it != std::end(signals));
-            const auto index_it = swapchain_it->second.find(index);
-            assert(index_it != std::end(swapchain_it->second));
-
-            const auto& semaphore = index_it->second;
-            collected_semaphores.emplace(index_it->second);
-        }
-        return collected_semaphores;
-    }();
-
-    const auto acquire_iter = std::ranges::find_if(
-        std::rbegin(this->submissions), std::rend(this->submissions),
-        [&](const auto& submission) {
-            return std::ranges::any_of(
-                submission->waits, [&](const auto& wait) {
-                    return collected_semaphores.contains(wait);
-                });
-        });
-
-    if (acquire_iter == std::rend(this->submissions)) {
-        std::cerr << "couldn't find starting submission!\n";
-        return;
-    }
-    const auto& acquire = *acquire_iter;
-    (*acquire_iter)->debug += "acquire ";
-
-    const auto present_iter = std::ranges::find_if(
-        std::rbegin(this->submissions), std::rend(this->submissions),
-        [&](const auto& submission) {
-            return std::ranges::any_of(
-                submission->signals, [&](const auto& signal) {
-                    if (waits.contains(signal)) {
-                        std::cerr
-                            << "queue with signal matching present iter found: "
-                            << signal << '\n';
-                    }
-
-                    return waits.contains(signal);
-                });
-        });
-
-    if (present_iter == std::rend(this->submissions)) {
-        std::cerr << "couldn't find ending submission!\n";
+    // We are going to assume that all queue submissions before and on the same
+    // queue contribute to the frame.
+
+    // This used to be more complicated where we found the first submission that
+    // was signalled by acquire, then we walked forwards until we found the
+    // submission before it that marked the end of frame (which was the last
+    // submission in the previous frame that called notify submit). This seemed
+    // completely redundant, in all cases it was exactly what we have here. But
+    // I could be wrong.
+
+    // that wa
+    const auto start_iter = std::begin(this->submissions);
+    // no op submit?
+    if (start_iter == std::end(this->submissions)) {
+        std::cerr << "ignored no op submit\n";
         return;
     }
-    (*present_iter)->debug += "present ";
-    const auto& end = *present_iter;
-
-    std::cerr << "present for queue: " << queue << ", our waits:\n";
-    for (const auto& wait : waits) {
-        std::cerr << "      " << wait << '\n';
-    }
-
-    // The work including and between acquire -> present is effectively
-    // guaranteed to contribute to our frame. We are going to mark this point
-    // for future queues to read the 'start of frame' from.
-    (*present_iter)->end_of_frame_marker = true;
-
-    // Now we read backwards to try to find our true start, starting at our
-    // acquire.
-    const auto after_previous_present_iter = std::prev(std::ranges::find_if(
-        std::next(acquire_iter), std::rend(this->submissions),
-        [](const auto& submission) {
-            return submission->end_of_frame_marker;
-        }));
-
-    (*after_previous_present_iter)->debug += "after_previous_present ";
-
-    this->submissions.back()->debug += "last_in_host_present ";
-
-    std::cerr << "the present iter was dist from back of: "
-              << std::distance(std::rbegin(this->submissions), present_iter)
-              << '\n';
-    std::cerr << "the acquire iter was dist from back of: "
-              << std::distance(std::rbegin(this->submissions), acquire_iter)
-              << '\n';
-    std::cerr << "the before_previous_present iter was dist from back of: "
-              << std::distance(std::rbegin(this->submissions),
-                               after_previous_present_iter)
-              << '\n';
-
-    auto frame =
-        Frame{.start =
-                  Frame::Timepoint{
-                      .context = *this,
-                      .handle = (*after_previous_present_iter)->start_handle,
-                      .sequence = (*after_previous_present_iter)->sequence,
-                  },
-              .end = Frame::Timepoint{
-                  .context = *this,
-                  .handle = (*present_iter)->end_handle,
-                  .sequence = (*present_iter)->sequence,
-              }};
-    this->in_flight_frames.emplace_back(
-        std::make_unique<Frame>(std::move(frame)));
+    const auto last_iter = std::prev(std::end(this->submissions));
+
+    (*start_iter)->debug += "first_during_present ";
+    (*last_iter)->debug += "last_during_present ";
+
+    // TODO We probably want to walk through other graphics queues and get
+    // the time that they submitted at some stage. Somewhat difficult to do,
+    // we probably need to create a graph of dependencies but it seems like
+    // overengineering. I will just get this working well before I deal
+    // with complicated edge cases.
+
+    this->in_flight_frames.emplace_back(Frame{
+        .submissions = std::move(this->submissions),
+        .sequence = (*last_iter)->sequence,
+    });
+    assert(std::size(this->in_flight_frames.back().submissions));
+    // *valid but unspecified state after move, so clear!*
+    this->submissions.clear();
 }
 
 const auto debug_log_time = [](const auto& diff) {
@@ -269,74 +192,106 @@ void QueueContext::process_frames() {
         return;
     }
 
-    // Collect all devices and call calibrate.
-    [this]() -> void {
-        using context_ref_t = std::reference_wrapper<DeviceContext>;
-        auto target_devices = std::unordered_map<VkDevice, context_ref_t>{};
-        for (const auto& frame : this->in_flight_frames) {
-            auto& start = frame->start.context.device_context;
-            auto& end = frame->end.context.device_context;
-
-            target_devices.try_emplace(start.device, std::ref(start));
-            target_devices.try_emplace(end.device, std::ref(end));
-        }
-        for (const auto& pair : target_devices) {
-            auto& device = pair.second.get();
-            device.clock.calibrate();
-        }
+    // We used to collect all devices that were pointed to by all potential
+    // submissions, put them in a set and then call.calibrate() on each once.
+    // This is unnecessary now - we can assume all submissions come from the
+    // same queue (this one!).
+    auto& device_context = this->device_context;
+    auto& clock = device_context.clock;
+    clock.calibrate();
+
+    // Get the queue's sequence number so we can quickly check
+    // frames are finished without calling getCalibratedTimestamps.
+    // This is somewhat a premature optimization but it's elegant.
+    const auto seq = [&, this]() -> auto {
+        auto seq = std::uint64_t{0};
+        device_context.vtable.GetSemaphoreCounterValueKHR(
+            device_context.device, this->semaphore, &seq);
+        return seq;
     }();
 
-    const auto get_tick_time = [](const auto& timepoint)
-        -> std::optional<DeviceContext::Clock::time_point_t> {
-        const auto& handle = timepoint.handle;
-        const auto& context = timepoint.context;
-
-        const auto ticks = handle->get_ticks(*context.timestamp_pool);
-        if (!ticks.has_value()) {
-            return std::nullopt;
-        }
-        const auto& clock = context.device_context.clock;
-        return clock.ticks_to_time(*ticks);
-    };
-
-    std::cerr << "starting frame readout\n";
     while (std::size(this->in_flight_frames)) {
         const auto& frame = this->in_flight_frames.front();
-        assert(frame);
 
-        const auto a = get_tick_time(frame->start);
-        if (!a.has_value()) {
-            break;
-        }
+        // There should be at least one submission, we guarantee it in
+        // notify_present.
+        assert(std::size(frame.submissions));
+
+        const auto& last_submission = frame.submissions.back();
 
-        const auto b = get_tick_time(frame->end);
-        if (!b.has_value()) {
+        // Not completed (so future frames definitely aren't) - stop early.
+        if (seq < last_submission->sequence) {
             break;
         }
 
-        // We are committed to removing the frame at this stage and promoting it
-        // to a 'timing' struct because it's completed..
+        // We are committed to removing the frame at this stage and
+        // promoting it to a 'timing' struct because it's completed.
+        // We can guarantee that we can extract timing information from
+        // all start/end handles now.
+
+        // Using leetcode merge intervals in the wild lol
+        struct Interval {
+            DeviceContext::Clock::time_point_t start, end;
+        };
+
+        const auto sorted_intervals = [&, this]() -> auto {
+            auto intervals = std::vector<Interval>{};
+            std::ranges::transform(
+                frame.submissions, std::back_inserter(intervals),
+                [&, this](const auto& submission) {
+                    const auto get_time = [&, this](const auto& handle) {
+                        const auto t = handle->get_ticks(*this->timestamp_pool);
+                        assert(t.has_value()); // guaranteed from seq.
+                        return clock.ticks_to_time(*t);
+                    };
+
+                    return Interval{
+                        .start = get_time(submission->start_handle),
+                        .end = get_time(submission->end_handle),
+                    };
+                });
 
-        const auto frametime = *b - *a;
+            std::ranges::sort(intervals, [](const auto& a, const auto& b) {
+                return a.start < b.start;
+            });
+            return intervals;
+        }();
 
-        const auto cpu_time =
-            [&]() -> DeviceContext::Clock::time_point_t::duration {
-            const auto latest_iter = std::rbegin(this->timings);
-            if (latest_iter == std::rend(this->timings)) {
-                return DeviceContext::Clock::time_point_t::duration::zero();
+        const auto merged = [&]() -> auto {
+            auto merged = std::vector<Interval>{};
+            auto last = sorted_intervals[0];
+
+            for (const auto& [s, e] : sorted_intervals | std::views::drop(1)) {
+                if (s <= last.end) {
+                    last.end = std::max(last.end, e);
+                } else {
+                    merged.push_back(last);
+                    last = {s, e};
+                }
             }
-            return *a - (*latest_iter)->gpu_end;
+            merged.push_back(last);
+            return merged;
         }();
 
-        std::cerr
-            << "        calculated total time from last frame (frametime): ";
-        debug_log_time(*b - *a);
+        const auto gputime = std::ranges::fold_left(
+            merged, DeviceContext::Clock::time_point_t::duration{},
+            [](auto gputime, const auto& interval) {
+                const auto& [start, end] = interval;
+                return gputime + (end - start);
+            });
+
+        const auto start = merged.front().start;
+        const auto end = merged.back().end;
+        const auto not_gputime =
+            (merged.back().end - merged.front().start) - gputime;
+
+        auto timing = Timing{
+            .gputime = gputime,
+            .not_gputime = not_gputime,
+            .frame = frame,
+        };
+        this->timings.emplace_back(std::make_unique<Timing>(timing));
 
-        this->timings.emplace_back(std::make_unique<Timing>(
-            Timing{.gpu_end = *b,
-                   .gpu_time = frametime,
-                   .cpu_time = cpu_time,
-                   .frame = std::move(this->in_flight_frames.front())}));
         this->in_flight_frames.pop_front();
     }
 
@@ -355,22 +310,23 @@ void QueueContext::sleep_in_present() {
     const auto& vtable = device.vtable;
 
     // Call this to push all in flight frames into our timings structure,
-    // but only if they're completed. So now they are truly *in flight frames*.
+    // but only if they're completed. So now they are truly *in flight
+    // frames*.
     this->process_frames();
 
     if (const auto F = std::size(this->in_flight_frames); F > 1) {
-        // In this case, we are so far ahead that there are multiple frames in
-        // flight. Either that, or our bookkeeping has gone horribly wrong! Wait
-        // on the 2nd last frame in flight to complete. This shunts us to F=1.
+        // In this case, we are so far ahead that there are multiple frames
+        // in flight. Either that, or our bookkeeping has gone horribly
+        // wrong! Wait on the 2nd last frame in flight to complete. This
+        // shunts us to F=1.
         const auto second_iter = std::next(std::rbegin(this->in_flight_frames));
         assert(second_iter != std::rend(this->in_flight_frames));
 
-        const auto& frame = (*second_iter)->end.sequence;
         const auto swi = VkSemaphoreWaitInfo{
             .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
             .semaphoreCount = 1,
             .pSemaphores = &this->semaphore,
-            .pValues = &(*second_iter)->end.sequence,
+            .pValues = &second_iter->sequence,
         };
         vtable.WaitSemaphoresKHR(device.device, &swi,
                                  std::numeric_limits<std::uint64_t>::max());
@@ -382,6 +338,8 @@ void QueueContext::sleep_in_present() {
         return;
     }
 
+    // We are checking size again because process_frames might have drained
+    // it to zero.
     if (!std::size(this->in_flight_frames)) {
         return;
     }
@@ -397,34 +355,34 @@ void QueueContext::sleep_in_present() {
         std::ranges::transform(this->timings, std::back_inserter(vect),
                                [](const auto& timing) { return timing.get(); });
         std::ranges::sort(vect, [](const auto& a, const auto& b) {
-            return a->gpu_time < b->gpu_time;
+            return a->gputime < b->gputime;
         });
         // return vect[0]->frametime;
-        return vect[std::size(vect) / 2]->gpu_time;
+        return vect[std::size(vect) / 2]->gputime;
     }();
 
-    const auto expected_cputime = [&, this]() {
+    const auto expected_not_gputime = [&, this]() {
         auto vect = std::vector<Timing*>{};
         std::ranges::transform(this->timings, std::back_inserter(vect),
                                [](const auto& timing) { return timing.get(); });
         std::ranges::sort(vect, [](const auto& a, const auto& b) {
-            return a->cpu_time < b->cpu_time;
+            return a->not_gputime < b->not_gputime;
         });
         // return vect[0]->frametime;
-        return vect[std::size(vect) / 2]->cpu_time;
+        return vect[std::size(vect) / 2]->not_gputime;
     }();
+
     std::cerr << "    expected gputime: ";
     debug_log_time(expected_gputime);
-    std::cerr << "    expected cputime: ";
-    debug_log_time(expected_cputime);
+    std::cerr << "    expected not_gputime: ";
+    debug_log_time(expected_not_gputime);
 
     //                               PRESENT CALL
-    // |--------------|-------------------|----------------|
-    // a        swap_acquire              b                c
+    // |----------------------------------|----------------|
+    // first                              b                c
     //
-    // Us, the CPU on the host, is approximately at 'b'. We have a good guess
-    // for the distance between a and b as gputime. Our educated guess for the
-    // distance between b and c is cputime. The GPU is between a and b.
+    // Us, the CPU on the host, is approximately at 'b'. We have a good
+    // guess for the distance between a and b as gputime.
 
     const auto& frame = this->in_flight_frames.back();
 
@@ -432,137 +390,49 @@ void QueueContext::sleep_in_present() {
     // It's impossible to make a decision until we know a.
     // Doing this is fine because it won't affect throughput at all.
     // (ie, there's more work queued after regardless).
+    // FIXME: If a == b, then we're waiting for the entire queue
+    // to finish because the semaphore only says if it has finished.
+    // The fix is to check the start timestamp instead of the query
+    // in the case that it's...
+    // Honestly it might be better to signal two semaphores because
+    // we need to wait for when the submission starts work and
+    // right now, we only signal when the submission finishes work.
+    // Ideally we have both, so we can elegantly wait on the start
+    // semaphore of A, then get A's start timestamp. This is BROKEN.
+
     [&]() -> void {
         const auto swi = VkSemaphoreWaitInfo{
             .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
             .semaphoreCount = 1,
             .pSemaphores = &this->semaphore,
-            .pValues = &frame->start.sequence,
+            .pValues = &frame.submissions.front()->sequence,
         };
         vtable.WaitSemaphoresKHR(device.device, &swi,
                                  std::numeric_limits<std::uint64_t>::max());
     }();
 
-    // We now know that A is available because its semaphore has been signalled.
-    const auto a_handle = frame->start.handle;
-    const auto& a_context = frame->start.context;
+    // We now know that A is available because its semaphore has been
+    // signalled.
 
-    const auto a_ticks_opt = a_handle->get_ticks(*a_context.timestamp_pool);
-    assert(a_ticks_opt.has_value());
-    const auto& a_clock = a_context.device_context.clock;
-    const auto a = a_clock.ticks_to_time(*a_ticks_opt);
+    const auto a = [&, this]() {
+        const auto& handle = frame.submissions.front()->start_handle;
+        const auto ticks = handle->get_ticks(*this->timestamp_pool);
+        assert(ticks.has_value());
+        return device_context.clock.ticks_to_time(*ticks);
+    }();
 
     const auto now = std::chrono::steady_clock::now();
     const auto dist = now - a;
-    const auto expected = expected_gputime - dist - expected_cputime;
+    const auto expected = expected_gputime - dist;
 
     const auto swi = VkSemaphoreWaitInfo{
         .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
         .semaphoreCount = 1,
         .pSemaphores = &this->semaphore,
-        .pValues = &frame->end.sequence,
+        .pValues = &frame.sequence,
     };
     vtable.WaitSemaphoresKHR(device.device, &swi,
                              std::max(expected.count(), 0l));
-
-    /*
-
-        // 2% of average gpu time for dealing with variance.
-        // This could be calculated more precisely with the
-        // numbers we have (like we could construct a high% confidence
-        // interval? not big on maths).
-        const auto slack = gputime / 50;
-
-        // All educated guesses:
-        //  dist_to_b = gputime - dist_to_last_b;
-        //  dist_to_last_b = now - last_b
-        //  sleep_until = now + extra_delay + slack + dist_to_b
-        //              = now + extra_delay + slack + (frametime -
-       dist_to_last_b)
-        //              = now + extra_delay + slack + frametime - (now - last_b)
-
-        const auto now = std::chrono::steady_clock::now();
-        assert(last_b <= now);
-        const auto dist = now - last_b;
-        // Even if this is negative, it's a no-op to sleep backwards.
-        const auto sleep_target =
-            now + extra_delay + slack + median_frametime - dist;
-            */
-
-    /*
-    std::cerr << " SLEEPING FOR: ";
-    debug_log_time(sleep_target - now);
-    std::this_thread::sleep_until(sleep_target);
-    */
-
-    /*
-    const auto get_tick_time = [](const auto& timepoint)
-        -> std::optional<DeviceContext::Clock::time_point_t> {
-        const auto& handle = timepoint.handle;
-        const auto& context = timepoint.context;
-
-        const auto ticks = handle->get_ticks(*context.timestamp_pool);
-        if (!ticks.has_value()) {
-            return std::nullopt;
-        }
-        const auto& clock = context.device_context.clock;
-        return clock.ticks_to_time(*ticks);
-    };
-
-    const auto finished_time =
-        get_tick_time(this->in_flight_frames.back()->end);
-    const auto after = std::chrono::steady_clock::now();
-    if (!finished_time.has_value()) {
-        std::cerr << "didnt finish late!\n";
-    } else {
-        std::cerr << "finished late by: ";
-        const auto late_time = after - *finished_time;
-    }
-
-    const auto get_tick_time2 = [this](const auto& handle)
-        -> std::optional<DeviceContext::Clock::time_point_t> {
-        const auto& context = *this;
-
-        const auto ticks = handle->get_ticks(*context.timestamp_pool);
-        if (!ticks.has_value()) {
-            return std::nullopt;
-        }
-        const auto& clock = context.device_context.clock;
-        return clock.ticks_to_time(*ticks);
-    };
-
-    this->submissions.back()->debug += " last_in_wait";
-
-    std::cerr << "---------------SUBMISSION READOUT POST FRAME SUBMIT AND "
-                 "WAIT--------------------\n";
-    auto i = std::size_t{0};
-    for (auto it = std::rbegin(this->submissions);
-         i < 20 && it != std::rend(this->submissions); ++it, ++i) {
-
-        std::cerr << "    submission -" << i << '\n';
-        const auto& submission = **it;
-
-        std::cerr << "        sequence target: " << submission.sequence << '\n';
-
-        std::cerr << "        debug tags: " << submission.debug << '\n';
-
-        const auto end_time = get_tick_time2(submission.end_handle);
-        std::cerr << "        end_time: ";
-        if (end_time.has_value()) {
-            debug_log_time(after - *end_time);
-        } else {
-            std::cerr << "not finished yet!\n";
-        }
-
-        const auto start_time = get_tick_time2(submission.start_handle);
-        std::cerr << "        start_time: ";
-        if (start_time.has_value()) {
-            debug_log_time(after - *start_time);
-        } else {
-            std::cerr << "not finished yet!\n";
-        }
-    }
-    */
 }
 
 } // namespace low_latency
 \ No newline at end of file
diff --git a/src/queue_context.hh b/src/queue_context.hh
index 172683d..ed13465 100644
--- a/src/queue_context.hh
+++ b/src/queue_context.hh
@@ -31,7 +31,8 @@ class QueueContext final : public Context {
 
   private:
     static constexpr auto MAX_TRACKED_TIMINGS = 50;
-    // Potentially in flight queue submissions
+
+    // Potentially in flight queue submissions that come from this queue.
     struct Submission {
         const std::unordered_set<VkSemaphore> signals;
         const std::unordered_set<VkSemaphore> waits;
@@ -41,32 +42,26 @@ class QueueContext final : public Context {
 
         std::uint64_t sequence;
 
-        bool end_of_frame_marker = false;
         std::string debug;
     };
-    std::deque<std::shared_ptr<Submission>> submissions;
+    using submission_ptr_t = std::shared_ptr<Submission>;
+    std::deque<submission_ptr_t> submissions;
 
-    // In flight frames!
-    // These might come from different contexts.
+    // In flight frame submissions grouped together.
+    // The first element in the vector refers to the first submission that
+    // contributed to that frame. The last element is the last submission before
+    // present was called.
     struct Frame {
-
-        struct Timepoint {
-            const QueueContext& context;
-            const std::shared_ptr<TimestampPool::Handle> handle;
-            std::uint64_t sequence;
-        };
-
-        const Timepoint start;
-        const Timepoint end;
+        std::deque<submission_ptr_t> submissions;
+        std::uint64_t sequence;
     };
-    std::deque<std::unique_ptr<Frame>> in_flight_frames;
+    std::deque<Frame> in_flight_frames;
 
+    // Completed frames.
     struct Timing {
-        DeviceContext::Clock::time_point_t gpu_end;
+        DeviceContext::Clock::time_point_t::duration gputime, not_gputime;
 
-        DeviceContext::Clock::time_point_t::duration gpu_time, cpu_time;
-        
-        std::unique_ptr<Frame> frame;
+        Frame frame;
     };
     std::deque<std::unique_ptr<Timing>> timings;
 
@@ -80,14 +75,12 @@ class QueueContext final : public Context {
 
   public:
     void
-    notify_submit(const VkSubmitInfo& info,
-                  const std::uint64_t& sequence,
+    notify_submit(const VkSubmitInfo& info, const std::uint64_t& sequence,
                   const std::shared_ptr<TimestampPool::Handle> head_handle,
                   const std::shared_ptr<TimestampPool::Handle> tail_handle);
 
     void
-    notify_submit(const VkSubmitInfo2& info,
-                  const std::uint64_t& sequence,
+    notify_submit(const VkSubmitInfo2& info, const std::uint64_t& sequence,
                   const std::shared_ptr<TimestampPool::Handle> head_handle,
                   const std::shared_ptr<TimestampPool::Handle> tail_handle);