aboutsummaryrefslogtreecommitdiff
path: root/src/queue_context.cc
blob: 6367e169b6371c74809a31e83cd90d39fc292d2d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
#include "queue_context.hh"
#include "device_context.hh"
#include "layer_context.hh"
#include "timestamp_pool.hh"

#include <algorithm>
#include <chrono>
#include <fstream>
#include <iostream>
#include <ranges>
#include <span>
#include <vulkan/vulkan_core.h>

namespace low_latency {

QueueContext::QueueContext(DeviceContext& device_context, const VkQueue& queue,
                           const std::uint32_t& queue_family_index)
    : device_context(device_context), queue(queue),
      queue_family_index(queue_family_index) {

    // The command pool must be created before the timestamp pool, because
    // the timestamp pool depends on it.
    const auto pool_info = VkCommandPoolCreateInfo{
        .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
        .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT |
                 VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
        .queueFamilyIndex = queue_family_index,
    };

    auto pool = VkCommandPool{};
    THROW_NON_VKSUCCESS(device_context.vtable.CreateCommandPool(
        device_context.device, &pool_info, nullptr, &pool));
    this->command_pool = pool;

    // The timestamp pool is only usable when the physical device advertises
    // the extensions we require, so only construct it in that case.
    if (device_context.physical_device.supports_required_extensions) {
        this->timestamp_pool = std::make_unique<TimestampPool>(*this);
    }
}

QueueContext::~QueueContext() {
    // Tear down in what looks like reverse dependency order: tracked frames
    // and submissions first, then the timestamp pool (which the constructor
    // notes depends on the command pool), and the command pool last.
    this->in_flight_frames.clear();
    this->submissions.clear();
    this->timestamp_pool.reset();

    this->device_context.vtable.DestroyCommandPool(
        this->device_context.device, this->command_pool, nullptr);
}

// Record a vkQueueSubmit so its semaphores and timestamp handles can later
// be attributed to a frame.
void QueueContext::notify_submit(
    const VkSubmitInfo& info,
    const std::shared_ptr<TimestampPool::Handle> head_handle,
    const std::shared_ptr<TimestampPool::Handle> tail_handle,
    const DeviceContext::Clock::time_point_t& now) {

    // Collect the wait/signal semaphores of this submission into sets.
    const auto wait_span =
        std::span{info.pWaitSemaphores, info.waitSemaphoreCount};
    const auto signal_span =
        std::span{info.pSignalSemaphores, info.signalSemaphoreCount};

    auto waits = std::unordered_set<VkSemaphore>(std::begin(wait_span),
                                                 std::end(wait_span));
    auto signals = std::unordered_set<VkSemaphore>(std::begin(signal_span),
                                                   std::end(signal_span));

    this->submissions.emplace_back(std::make_unique<Submission>(
        std::move(signals), std::move(waits), head_handle, tail_handle, now));

    // Bound the history so an application that never presents can't grow
    // this container without limit.
    if (std::size(this->submissions) > this->MAX_TRACKED_SUBMISSIONS) {
        this->submissions.pop_front();
    }
}

// Identical to the VkSubmitInfo overload above, but for vkQueueSubmit2's
// VkSubmitInfo2, where semaphores arrive wrapped in semaphore-info structs.
void QueueContext::notify_submit(
    const VkSubmitInfo2& info,
    const std::shared_ptr<TimestampPool::Handle> head_handle,
    const std::shared_ptr<TimestampPool::Handle> tail_handle,
    const DeviceContext::Clock::time_point_t& now) {

    auto signals = std::unordered_set<VkSemaphore>{};
    auto waits = std::unordered_set<VkSemaphore>{};

    // Unwrap each semaphore-info struct down to its bare semaphore handle.
    for (const auto& wait :
         std::span{info.pWaitSemaphoreInfos, info.waitSemaphoreInfoCount}) {
        waits.insert(wait.semaphore);
    }
    for (const auto& signal :
         std::span{info.pSignalSemaphoreInfos, info.signalSemaphoreInfoCount}) {
        signals.insert(signal.semaphore);
    }

    this->submissions.emplace_back(std::make_unique<Submission>(
        std::move(signals), std::move(waits), head_handle, tail_handle, now));

    // Bound the history so an application that never presents can't grow
    // this container without limit.
    if (std::size(this->submissions) > this->MAX_TRACKED_SUBMISSIONS) {
        this->submissions.pop_front();
    }
}

// Promote everything in `submissions` into a single in-flight Frame.
//
// We assume that all queue submissions recorded on this queue since the
// last present contribute to the frame being presented now.
//
// History: this used to be more complicated — we found the first submission
// signalled by acquire, then walked forwards to the submission marking the
// end of the previous frame. That computation always produced exactly the
// same boundary as "everything pending", so the walk (and a leftover dead
// `prev_frame_last_submit` computation) has been removed.
void QueueContext::drain_submissions_to_frame() {

    // Nothing was submitted since the last present (a no-op present) —
    // nothing to promote.
    if (this->submissions.empty()) {
        return;
    }

    this->in_flight_frames.emplace_back(
        Frame{.submissions = std::move(this->submissions),
              .cpu_post_present_time = std::chrono::steady_clock::now()});
    assert(std::size(this->in_flight_frames.back().submissions));

    // The moved-from container is in a valid-but-unspecified state, so
    // clear() puts it back into a known-empty one.
    this->submissions.clear();
}

// Called on vkQueuePresentKHR: close out the current frame, harvest any
// completed frame timings, and optionally apply the antilag-1 sleep.
void QueueContext::notify_present(const VkPresentInfoKHR& info) {
    this->drain_submissions_to_frame();
    this->drain_frames_to_timings();

    // Antilag 2 data is sent to the device rather than any particular
    // queue, so notify the device context that this queue just presented.
    this->device_context.notify_queue_present(*this);

    // Sleeping in present only makes sense when:
    //   1. antilag_mode is not ON — otherwise the sleep happens at input
    //      time instead, with far better results; and
    //   2. the layer-wide 'is_antilag_1_enabled' flag is set.
    const auto antilag_2_active =
        this->device_context.antilag_mode == VK_ANTI_LAG_MODE_ON_AMD;
    const auto antilag_1_enabled =
        this->device_context.instance.layer.is_antilag_1_enabled;

    if (!antilag_2_active && antilag_1_enabled) {
        this->sleep_in_present();
    }
}

// Write a duration to `stream` broken into milli/micro/nanosecond parts,
// e.g. "5ms 123us 456ns ago" (unit suffixes come from C++20's chrono
// duration operator<<).
const auto debug_log_time2 = [](auto& stream, const auto& diff) {
    using namespace std::chrono;
    // Peel off each unit in turn, keeping only the remainder.
    auto rest = duration_cast<nanoseconds>(diff);
    const auto ms = duration_cast<milliseconds>(rest);
    rest -= ms;
    const auto us = duration_cast<microseconds>(rest);
    rest -= us;
    stream << ms << " " << us << " " << rest << " ago\n";
};

void QueueContext::drain_frames_to_timings() {
    if (!std::size(this->in_flight_frames)) {
        return;
    }

    // Only need to calibrate this device, we don't support multi device anti
    // lag.
    this->device_context.clock->calibrate();

    while (std::size(this->in_flight_frames)) {
        const auto& frame = this->in_flight_frames.front();

        assert(std::size(frame.submissions));

        const auto& last_submission = frame.submissions.back();

        // Not completed (so future frames definitely aren't) - stop early.
        if (!last_submission->end_handle->get_time().has_value()) {
            break;
        }

        // We are committed to removing the frame at this stage and
        // promoting it to a 'timing' struct because it's completed.
        // We can guarantee that we can extract timing information from
        // all start/end handles now.

        // Using leetcode merge intervals in the wild lol
        struct Interval {
            DeviceContext::Clock::time_point_t start, end;
        };

        const auto sorted_intervals = [&]() -> auto {
            auto intervals = std::vector<Interval>{};
            std::ranges::transform(
                frame.submissions, std::back_inserter(intervals),
                [&](const auto& submission) {
                    return Interval{
                        .start = submission->start_handle->get_time_required(),
                        .end = submission->end_handle->get_time_required(),
                    };
                });

            std::ranges::sort(intervals, [](const auto& a, const auto& b) {
                return a.start < b.start;
            });
            return intervals;
        }();

        const auto merged = [&]() -> auto {
            auto merged = std::vector<Interval>{};
            auto last = sorted_intervals[0];

            for (const auto& [s, e] : sorted_intervals | std::views::drop(1)) {
                if (s <= last.end) {
                    last.end = std::max(last.end, e);
                } else {
                    merged.push_back(last);
                    last = {s, e};
                }
            }
            merged.push_back(last);
            return merged;
        }();

        // It's important to note that gputime starts from a point which isn't
        // equal to the below 'start' var. It looks something like this, where a
        // '-' represents CPU time only and '=' represents CPU + GPU.
        //
        //   |---------------------|=========|--------|====|-----------------|
        //   ^ last_present        ^ merged.front().start            present ^
        //                               merged.back().end ^
        //
        // I would imagine there would be more GPU than cpu to reach the anti
        // lag codepath than is depicted here. We can track the total time
        // between vkPresent calls as future_submit - last_submit. The total
        // time the GPU spent engaged is the sum of all intervals. So we can
        // get a meaningful 'not_gputime' as total - gpu_time.

        const auto gputime = std::ranges::fold_left(
            merged, DeviceContext::Clock::time_point_t::duration{},
            [](auto gputime, const auto& interval) {
                const auto& [start, end] = interval;
                return gputime + (end - start);
            });

        // Our cpu_start value here refers to the time when the CPU was allowed
        // to move past the present call and, in theory, begin cpu work on the
        // next frame.
        const auto cpu_start = [&]() -> auto {
            if (const auto it = std::rbegin(this->timings);
                it != std::rend(this->timings)) {

                return (*it)->frame.cpu_post_present_time;
            }
            // This will happen once, only for the first frame. We don't
            // have a way of knowing when the CPU first started work here.
            // Just return our first submit's start for this edge case.
            return frame.submissions.front()->start_handle->get_time_required();
        }();

        const auto cputime =
            frame.submissions.front()->enqueued_time - cpu_start;

        this->timings.emplace_back(std::make_unique<Timing>(Timing{
            .gputime = gputime,
            .cputime = cputime,
            .frame = frame,
        }));

        this->in_flight_frames.pop_front();
    }

    if (const auto T = std::size(this->timings);
        T > this->MAX_TRACKED_TIMINGS) {

        const auto dist = T - this->MAX_TRACKED_TIMINGS;
        const auto erase_to_iter = std::next(std::begin(this->timings), dist);
        this->timings.erase(std::begin(this->timings), erase_to_iter);
    }
}

// Antilag-1 style sleep at present time: stall the CPU inside present until
// just before the GPU is expected to finish the in-flight frame, so the
// next frame's input sampling happens as late as possible.
void QueueContext::sleep_in_present() {
    // After calling this, any remaining frames are truly in flight.
    this->drain_frames_to_timings();
    if (!std::size(this->in_flight_frames)) {
        return;
    }

    // This is getting the most recent frame and waiting until its start has
    // begun. This means that, in the case of >1 frame in flight, it's draining
    // all of them before we're allowed to move forward.
    const auto first_gpu_work = [&]() -> auto {
        const auto& most_recent_frame = this->in_flight_frames.back();
        const auto& first_submission = most_recent_frame.submissions.front();
        return first_submission->start_handle->get_time_spinlock();
    }();

    // Drain frames again because as stated above, we might have multiple frames
    // now completed after our wait spinlock.
    this->drain_frames_to_timings();

    // Check the size again because the frame we want to target may have already
    // completed while we were draining above.
    if (!std::size(this->in_flight_frames)) {
        return;
    }
    assert(std::size(this->in_flight_frames) == 1);

    // Not enough data yet to apply any delays.
    if (std::size(this->timings) < this->MAX_TRACKED_TIMINGS) {
        return;
    }

    // Median of a projected value over the tracked timings. nth_element is
    // enough here — we only need the middle element, not a full sort.
    const auto calc_median = [this](const auto& getter) {
        auto vect = std::vector<Timing*>{};
        vect.reserve(std::size(this->timings));
        std::ranges::transform(this->timings, std::back_inserter(vect),
                               [](const auto& timing) { return timing.get(); });

        const auto mid = std::next(std::begin(vect), std::size(vect) / 2);
        std::ranges::nth_element(vect, mid, [&](const auto& a, const auto& b) {
            return getter(a) < getter(b);
        });
        return getter(*mid);
    };

    const auto expected_gputime =
        calc_median([](const auto& timing) { return timing->gputime; });
    const auto expected_cputime =
        calc_median([](const auto& timing) { return timing->cputime; });

    // Should look like this:
    //              total_length = expected_gputime
    // |------------------------x------------------------------|
    // ^ first_gpu_work        now               last_gpu_work ^

    const auto now = std::chrono::steady_clock::now();
    const auto dist = now - first_gpu_work;
    const auto expected_dist_to_last = expected_gputime - dist;

    // Wake early enough to leave room for the next frame's expected CPU work.
    const auto wait_time = expected_dist_to_last - expected_cputime;

    auto& frame = this->in_flight_frames.back();
    const auto& last_gpu_work = frame.submissions.back()->end_handle;
    last_gpu_work->get_time_spinlock(now + wait_time);

    frame.cpu_post_present_time = std::chrono::steady_clock::now();

    // NOTE(review): debug instrumentation — rewrites /tmp/times.txt on every
    // present. Consider gating or compiling this out for release builds.
    std::ofstream f("/tmp/times.txt", std::ios::trunc);
    f << "    expected gputime: ";
    debug_log_time2(f, expected_gputime);
    f << "    expected cputime: ";
    debug_log_time2(f, expected_cputime);
    f << "    requested sleep: ";
    debug_log_time2(f, wait_time);
    f << "    observed sleep: ";
    debug_log_time2(f, frame.cpu_post_present_time - now);
}

// Decide whether timestamp commands should be injected around this queue's
// submissions.
bool QueueContext::should_inject_timestamps() const {
    const auto& pd = this->device_context.physical_device;

    // Without the required extensions we can't record timestamps at all.
    if (!pd.supports_required_extensions) {
        return false;
    }

    // Don't bother injecting timestamps during queue submission if both AL1
    // (layer flag) and AL2 (device request) are disabled.
    const auto antilag_2_requested = this->device_context.was_antilag_requested;
    const auto antilag_1_enabled = pd.instance.layer.is_antilag_1_enabled;
    if (!antilag_2_requested && !antilag_1_enabled) {
        return false;
    }

    assert(pd.queue_properties);
    const auto& queue_props = *pd.queue_properties;
    assert(this->queue_family_index < std::size(queue_props));

    // Probably need at least 64, don't worry about it just yet and just ensure
    // it's not zero (because that will cause a crash if we inject).
    const auto valid_bits = queue_props[this->queue_family_index]
                                .queueFamilyProperties.timestampValidBits;
    return valid_bits != 0;
}

} // namespace low_latency