src/device_context.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142

#include "device_context.hh"
#include "queue_context.hh"

#include <utility>
#include <vulkan/vulkan_core.h>

namespace low_latency {

DeviceContext::DeviceContext(InstanceContext& parent_instance,
                             PhysicalDeviceContext& parent_physical_device,
                             const VkDevice& device,
                             VkuDeviceDispatchTable&& vtable)
    : instance(parent_instance), physical_device(parent_physical_device),
      device(device), vtable(std::move(vtable)), clock(*this) {}

DeviceContext::~DeviceContext() {
    this->present_queue.reset();
    // We will let the destructor handle clearing here, but they should be
    // unique by now (ie, removed from the layer's context map).
    for (const auto& [queue, queue_context] : this->queues) {
        assert(queue_context.unique());
    }
}

DeviceContext::Clock::Clock(const DeviceContext& context) : device(context) {
    this->calibrate();
}

DeviceContext::Clock::~Clock() {}

void DeviceContext::Clock::calibrate() {
    const auto infos = std::vector<VkCalibratedTimestampInfoKHR>{
        {VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr,
         VK_TIME_DOMAIN_DEVICE_EXT},
        {VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, nullptr,
         VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT}};

    struct CalibratedResult {
        std::uint64_t device;
        std::uint64_t host;
    };
    auto calibrated_result = CalibratedResult{};
    device.vtable.GetCalibratedTimestampsKHR(device.device, 2, std::data(infos),
                                             &calibrated_result.device,
                                             &this->error_bound);
    this->device_ticks = calibrated_result.device;
    this->host_ns = calibrated_result.host;
}

DeviceContext::Clock::time_point_t
DeviceContext::Clock::ticks_to_time(const std::uint64_t& ticks) const {
    const auto& pd = device.physical_device.properties;
    const auto ns_tick = static_cast<double>(pd->limits.timestampPeriod);

    const auto diff = [&]() -> auto {
        auto a = this->device_ticks;
        auto b = ticks;
        const auto is_negative = a > b;
        if (is_negative) {
            std::swap(a, b);
        }
        const auto abs_diff = b - a;
        assert(abs_diff <= std::numeric_limits<std::int64_t>::max());
        const auto signed_abs_diff = static_cast<std::int64_t>(abs_diff);
        return is_negative ? -signed_abs_diff : signed_abs_diff;
    }();

    // This will have issues because std::chrono::steady_clock::now(), which
    // we use for cpu time, may not be on the same time domain what was returned
    // by GetCalibratedTimestamps. It would be more robust to use the posix
    // gettime that vulkan guarantees it can be compared to instead.

    const auto diff_nsec = static_cast<std::int64_t>(diff * ns_tick + 0.5);
    const auto delta = std::chrono::nanoseconds(this->host_ns + diff_nsec);
    return time_point_t{delta};
}

void DeviceContext::notify_acquire(const VkSwapchainKHR& swapchain,
                                   const std::uint32_t& image_index,
                                   const VkSemaphore& signal_semaphore) {
    const auto it = this->swapchain_signals.try_emplace(swapchain).first;

    // Doesn't matter if it was already there, overwrite it.
    it->second.insert_or_assign(image_index, signal_semaphore);
}

void DeviceContext::sleep_in_input() {
    // Present hasn't happened yet, we don't know what queue to attack.
    if (!this->present_queue) {
        return;
    }

    const auto& frames = this->present_queue->in_flight_frames;
    // No frame here means we're behind the GPU and do not need to delay.
    // If anything we should speed up...
    if (!std::size(frames)) {
        return;
    }

    // If we're here, that means that there might be an outstanding frame that's
    // sitting on our present_queue which hasn't yet completed, so we need to
    // stall until it's finished.
    const auto& last_frame = frames.back();
    assert(std::size(last_frame.submissions));
    const auto& last_frame_submission = frames.back().submissions.back();
    last_frame_submission->end_handle->get_time_spinlock();

    // From our sleep in present implementation, just spinning until
    // the previous frame has completed did not work well. This was because
    // there was a delay between presentation and when new work was given
    // to the GPU. If we stalled the CPU without trying to account for this, we
    // would get huge frame drops, loss of throughput, and the GPU would even
    // clock down. So naturally I am concerned about this approach, but it seems
    // to perform well so far in my own testing and is just beautifully elegant.
}

void DeviceContext::notify_antilag_update(const VkAntiLagDataAMD& data) {
    this->antilag_mode = data.mode;
    this->antilag_fps = data.maxFPS; // TODO

    // This might not be provided (probably just to set some settings?).
    if (!data.pPresentationInfo) {
        return;
    }

    const auto& presentation_info = *data.pPresentationInfo;
    // Only care about the input stage for now.
    if (presentation_info.stage != VK_ANTI_LAG_STAGE_INPUT_AMD) {
        return;
    }

    if (this->antilag_mode == VK_ANTI_LAG_MODE_ON_AMD) {
        this->sleep_in_input();
    }
}

void DeviceContext::notify_queue_present(const QueueContext& queue) {
    assert(this->queues.contains(queue.queue));
    this->present_queue = this->queues[queue.queue];
}

} // namespace low_latency